diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -102,8 +102,9 @@
 
         * Running a serial build will be **slow**.  To improve speed, try running a
           parallel build.  That's done by default in Ninja; for ``make``, use the option
-          ``-j NNN``, where ``NNN`` is the number of parallel jobs, e.g. the number of
-          CPUs you have.
+          ``-j NNN``, where ``NNN`` is the number of parallel jobs to run.
+          In most cases, you get the best performance by setting ``NNN`` to the number of CPU threads you have.
+          On Unix systems that provide the ``nproc`` utility, you can do this with ``-j$(nproc)``.
 
       * For more information see [CMake](https://llvm.org/docs/CMake.html)
 
diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst
--- a/clang-tools-extra/docs/clang-tidy/Contributing.rst
+++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst
@@ -364,6 +364,11 @@
 test.  The ``ASTMatchersTests`` target contains unit tests for the public AST matcher
 classes and is a good source of testing idioms for matchers.
 
+You can build the clang-tidy unit tests by building the ``ClangTidyTests`` target.
+Test targets in LLVM and Clang are excluded from the "build all" style action of
+IDE-based CMake generators, so you need to build this target explicitly for the
+unit tests to be compiled.
+
 Making your check robust
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/docs/ClangFormattedStatus.rst b/clang/docs/ClangFormattedStatus.rst
--- a/clang/docs/ClangFormattedStatus.rst
+++ b/clang/docs/ClangFormattedStatus.rst
@@ -17,7 +17,7 @@
 ======================
 
 :doc:`ClangFormattedStatus` describes the state of LLVM source
-tree in terms of conformance to :doc:`ClangFormat` as of: January 03, 2022 11:33:59 (`cd2b050fa499 <https://github.com/llvm/llvm-project/commit/cd2b050fa499>`_).
+tree in terms of conformance to :doc:`ClangFormat` as of: March 06, 2022 17:32:26 (`830ba4cebe79 <https://github.com/llvm/llvm-project/commit/830ba4cebe79>`_).
 
 
 .. list-table:: LLVM Clang-Format Status
@@ -29,6 +29,106 @@
      - Formatted Files
      - Unformatted Files
      - % Complete
+   * - bolt/include/bolt/Core
+     - `15`
+     - `10`
+     - `5`
+     - :part:`66%`
+   * - bolt/include/bolt/Passes
+     - `47`
+     - `47`
+     - `0`
+     - :good:`100%`
+   * - bolt/include/bolt/Profile
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - bolt/include/bolt/Rewrite
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - bolt/include/bolt/RuntimeLibs
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - bolt/include/bolt/Utils
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - bolt/lib/Core
+     - `14`
+     - `5`
+     - `9`
+     - :part:`35%`
+   * - bolt/lib/Passes
+     - `45`
+     - `21`
+     - `24`
+     - :part:`46%`
+   * - bolt/lib/Profile
+     - `7`
+     - `3`
+     - `4`
+     - :part:`42%`
+   * - bolt/lib/Rewrite
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - bolt/lib/RuntimeLibs
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - bolt/lib/Target/AArch64
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - bolt/lib/Target/X86
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - bolt/lib/Utils
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - bolt/runtime
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - bolt/tools/driver
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - bolt/tools/heatmap
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - bolt/tools/llvm-bolt-fuzzer
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - bolt/tools/merge-fdata
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - bolt/unittests/Core
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - clang/bindings/python/tests/cindex/INPUTS
      - `5`
      - `3`
@@ -80,10 +180,10 @@
      - `2`
      - :none:`0%`
    * - clang/include/clang/Analysis/FlowSensitive
-     - `7`
-     - `6`
+     - `16`
+     - `15`
      - `1`
-     - :part:`85%`
+     - :part:`93%`
    * - clang/include/clang/Analysis/Support
      - `1`
      - `0`
@@ -251,12 +351,12 @@
      - :none:`0%`
    * - clang/include/clang/Tooling/DependencyScanning
      - `5`
-     - `4`
-     - `1`
-     - :part:`80%`
+     - `5`
+     - `0`
+     - :good:`100%`
    * - clang/include/clang/Tooling/Inclusions
-     - `2`
-     - `2`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - clang/include/clang/Tooling/Refactoring
@@ -279,6 +379,11 @@
      - `5`
      - `0`
      - :good:`100%`
+   * - clang/include/clang/Tooling/Syntax/Pseudo
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
    * - clang/include/clang/Tooling/Transformer
      - `8`
      - `6`
@@ -296,12 +401,12 @@
      - :none:`0%`
    * - clang/lib/Analysis
      - `28`
-     - `4`
-     - `24`
-     - :part:`14%`
+     - `3`
+     - `25`
+     - :part:`10%`
    * - clang/lib/Analysis/FlowSensitive
-     - `2`
-     - `2`
+     - `7`
+     - `7`
      - `0`
      - :good:`100%`
    * - clang/lib/Analysis/plugins/CheckerDependencyHandling
@@ -361,9 +466,9 @@
      - :part:`50%`
    * - clang/lib/CodeGen
      - `87`
-     - `10`
-     - `77`
-     - :part:`11%`
+     - `9`
+     - `78`
+     - :part:`10%`
    * - clang/lib/CrossTU
      - `1`
      - `0`
@@ -400,9 +505,9 @@
      - `12`
      - :part:`14%`
    * - clang/lib/Driver/ToolChains
-     - `95`
+     - `94`
      - `41`
-     - `54`
+     - `53`
      - :part:`43%`
    * - clang/lib/Driver/ToolChains/Arch
      - `20`
@@ -415,15 +520,15 @@
      - `3`
      - :none:`0%`
    * - clang/lib/Format
-     - `33`
-     - `33`
+     - `35`
+     - `35`
      - `0`
      - :good:`100%`
    * - clang/lib/Frontend
      - `32`
-     - `3`
-     - `29`
-     - :part:`9%`
+     - `4`
+     - `28`
+     - :part:`12%`
    * - clang/lib/Frontend/Rewrite
      - `8`
      - `0`
@@ -436,14 +541,14 @@
      - :none:`0%`
    * - clang/lib/Headers
      - `146`
-     - `16`
-     - `130`
-     - :part:`10%`
+     - `14`
+     - `132`
+     - :part:`9%`
    * - clang/lib/Headers/openmp_wrappers
      - `5`
-     - `5`
-     - `0`
-     - :good:`100%`
+     - `4`
+     - `1`
+     - :part:`80%`
    * - clang/lib/Headers/ppc_wrappers
      - `7`
      - `2`
@@ -465,9 +570,9 @@
      - `0`
      - :good:`100%`
    * - clang/lib/Lex
-     - `23`
+     - `24`
      - `1`
-     - `22`
+     - `23`
      - :part:`4%`
    * - clang/lib/Parse
      - `15`
@@ -481,19 +586,19 @@
      - :none:`0%`
    * - clang/lib/Sema
      - `55`
-     - `5`
-     - `50`
-     - :part:`9%`
+     - `4`
+     - `51`
+     - :part:`7%`
    * - clang/lib/Serialization
      - `17`
      - `2`
      - `15`
      - :part:`11%`
    * - clang/lib/StaticAnalyzer/Checkers
-     - `118`
-     - `16`
-     - `102`
-     - :part:`13%`
+     - `122`
+     - `19`
+     - `103`
+     - :part:`15%`
    * - clang/lib/StaticAnalyzer/Checkers/cert
      - `2`
      - `2`
@@ -551,17 +656,17 @@
      - :none:`0%`
    * - clang/lib/Tooling/DependencyScanning
      - `5`
-     - `3`
-     - `2`
-     - :part:`60%`
+     - `4`
+     - `1`
+     - :part:`80%`
    * - clang/lib/Tooling/DumpTool
      - `4`
      - `3`
      - `1`
      - :part:`75%`
    * - clang/lib/Tooling/Inclusions
-     - `2`
-     - `2`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - clang/lib/Tooling/Refactoring
@@ -584,6 +689,11 @@
      - `6`
      - `1`
      - :part:`85%`
+   * - clang/lib/Tooling/Syntax/Pseudo
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
    * - clang/lib/Tooling/Transformer
      - `7`
      - `4`
@@ -669,6 +779,11 @@
      - `0`
      - `1`
      - :none:`0%`
+   * - clang/tools/clang-linker-wrapper
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
    * - clang/tools/clang-nvlink-wrapper
      - `1`
      - `1`
@@ -684,6 +799,11 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - clang/tools/clang-pseudo
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - clang/tools/clang-refactor
      - `4`
      - `4`
@@ -735,10 +855,10 @@
      - `4`
      - :part:`33%`
    * - clang/unittests/Analysis/FlowSensitive
-     - `5`
-     - `5`
-     - `0`
-     - :good:`100%`
+     - `14`
+     - `13`
+     - `1`
+     - :part:`92%`
    * - clang/unittests/AST
      - `30`
      - `8`
@@ -780,8 +900,8 @@
      - `4`
      - :part:`20%`
    * - clang/unittests/Format
-     - `23`
-     - `23`
+     - `24`
+     - `24`
      - `0`
      - :good:`100%`
    * - clang/unittests/Frontend
@@ -810,10 +930,10 @@
      - `1`
      - :none:`0%`
    * - clang/unittests/Lex
-     - `7`
-     - `3`
+     - `8`
      - `4`
-     - :part:`42%`
+     - `4`
+     - :part:`50%`
    * - clang/unittests/libclang
      - `2`
      - `0`
@@ -850,10 +970,10 @@
      - `9`
      - :part:`43%`
    * - clang/unittests/Tooling
-     - `29`
-     - `8`
-     - `21`
-     - :part:`27%`
+     - `30`
+     - `10`
+     - `20`
+     - :part:`33%`
    * - clang/unittests/Tooling/RecursiveASTVisitorTests
      - `30`
      - `12`
@@ -864,6 +984,11 @@
      - `3`
      - `4`
      - :part:`42%`
+   * - clang/unittests/Tooling/Syntax/Pseudo
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
    * - clang/utils/perf-training/cxx
      - `1`
      - `0`
@@ -965,10 +1090,10 @@
      - `1`
      - :none:`0%`
    * - clang-tools-extra/clang-tidy
-     - `18`
-     - `12`
+     - `20`
+     - `14`
      - `6`
-     - :part:`66%`
+     - :part:`70%`
    * - clang-tools-extra/clang-tidy/abseil
      - `42`
      - `31`
@@ -990,10 +1115,10 @@
      - `0`
      - :good:`100%`
    * - clang-tools-extra/clang-tidy/bugprone
-     - `121`
-     - `101`
-     - `20`
-     - :part:`83%`
+     - `125`
+     - `106`
+     - `19`
+     - :part:`84%`
    * - clang-tools-extra/clang-tidy/cert
      - `29`
      - `28`
@@ -1006,9 +1131,9 @@
      - :part:`80%`
    * - clang-tools-extra/clang-tidy/cppcoreguidelines
      - `45`
-     - `41`
-     - `4`
-     - :part:`91%`
+     - `42`
+     - `3`
+     - :part:`93%`
    * - clang-tools-extra/clang-tidy/darwin
      - `5`
      - `2`
@@ -1045,15 +1170,15 @@
      - `0`
      - :good:`100%`
    * - clang-tools-extra/clang-tidy/misc
-     - `31`
-     - `28`
+     - `33`
+     - `30`
      - `3`
      - :part:`90%`
    * - clang-tools-extra/clang-tidy/modernize
      - `67`
-     - `49`
-     - `18`
-     - :part:`73%`
+     - `48`
+     - `19`
+     - :part:`71%`
    * - clang-tools-extra/clang-tidy/mpi
      - `5`
      - `5`
@@ -1085,10 +1210,10 @@
      - `2`
      - :part:`60%`
    * - clang-tools-extra/clang-tidy/readability
-     - `83`
-     - `70`
-     - `13`
-     - :part:`84%`
+     - `88`
+     - `76`
+     - `12`
+     - :part:`86%`
    * - clang-tools-extra/clang-tidy/tool
      - `3`
      - `2`
@@ -1106,9 +1231,9 @@
      - :good:`100%`
    * - clang-tools-extra/clangd
      - `97`
-     - `83`
-     - `14`
-     - :part:`85%`
+     - `81`
+     - `16`
+     - :part:`83%`
    * - clang-tools-extra/clangd/benchmarks
      - `1`
      - `1`
@@ -1126,14 +1251,14 @@
      - :good:`100%`
    * - clang-tools-extra/clangd/index
      - `39`
-     - `37`
-     - `2`
-     - :part:`94%`
+     - `36`
+     - `3`
+     - :part:`92%`
    * - clang-tools-extra/clangd/index/dex
      - `9`
-     - `8`
-     - `1`
-     - :part:`88%`
+     - `7`
+     - `2`
+     - :part:`77%`
    * - clang-tools-extra/clangd/index/dex/dexp
      - `1`
      - `1`
@@ -1170,10 +1295,10 @@
      - `0`
      - :good:`100%`
    * - clang-tools-extra/clangd/refactor
-     - `4`
-     - `3`
+     - `6`
+     - `5`
      - `1`
-     - :part:`75%`
+     - :part:`83%`
    * - clang-tools-extra/clangd/refactor/tweaks
      - `14`
      - `10`
@@ -1190,8 +1315,8 @@
      - `0`
      - :good:`100%`
    * - clang-tools-extra/clangd/unittests
-     - `78`
-     - `65`
+     - `79`
+     - `66`
      - `13`
      - :part:`83%`
    * - clang-tools-extra/clangd/unittests/decision_forest_model
@@ -1211,9 +1336,9 @@
      - :good:`100%`
    * - clang-tools-extra/clangd/unittests/tweaks
      - `20`
-     - `20`
-     - `0`
-     - :good:`100%`
+     - `19`
+     - `1`
+     - :part:`95%`
    * - clang-tools-extra/clangd/unittests/xpc
      - `1`
      - `1`
@@ -1286,9 +1411,9 @@
      - :none:`0%`
    * - clang-tools-extra/unittests/clang-tidy
      - `16`
-     - `8`
-     - `8`
-     - :part:`50%`
+     - `9`
+     - `7`
+     - :part:`56%`
    * - clang-tools-extra/unittests/include/common
      - `1`
      - `0`
@@ -1310,10 +1435,10 @@
      - `1`
      - :part:`66%`
    * - compiler-rt/lib/asan
-     - `56`
-     - `4`
+     - `57`
+     - `5`
      - `52`
-     - :part:`7%`
+     - :part:`8%`
    * - compiler-rt/lib/asan/tests
      - `17`
      - `1`
@@ -1346,9 +1471,9 @@
      - :none:`0%`
    * - compiler-rt/lib/dfsan
      - `14`
-     - `10`
-     - `4`
-     - :part:`71%`
+     - `9`
+     - `5`
+     - :part:`64%`
    * - compiler-rt/lib/fuzzer
      - `47`
      - `9`
@@ -1395,10 +1520,10 @@
      - `0`
      - :good:`100%`
    * - compiler-rt/lib/hwasan
-     - `29`
-     - `8`
+     - `30`
+     - `9`
      - `21`
-     - :part:`27%`
+     - :part:`30%`
    * - compiler-rt/lib/interception
      - `8`
      - `1`
@@ -1415,10 +1540,10 @@
      - `16`
      - :part:`20%`
    * - compiler-rt/lib/memprof
-     - `32`
      - `31`
-     - `1`
-     - :part:`96%`
+     - `29`
+     - `2`
+     - :part:`93%`
    * - compiler-rt/lib/memprof/tests
      - `2`
      - `2`
@@ -1435,10 +1560,10 @@
      - `4`
      - :none:`0%`
    * - compiler-rt/lib/orc
-     - `19`
-     - `14`
+     - `21`
+     - `16`
      - `5`
-     - :part:`73%`
+     - :part:`76%`
    * - compiler-rt/lib/orc/unittests
      - `10`
      - `9`
@@ -1456,9 +1581,9 @@
      - :part:`33%`
    * - compiler-rt/lib/sanitizer_common
      - `167`
-     - `28`
-     - `139`
-     - :part:`16%`
+     - `29`
+     - `138`
+     - :part:`17%`
    * - compiler-rt/lib/sanitizer_common/symbolizer
      - `2`
      - `2`
@@ -1476,9 +1601,9 @@
      - :none:`0%`
    * - compiler-rt/lib/scudo/standalone
      - `49`
-     - `49`
-     - `0`
-     - :good:`100%`
+     - `48`
+     - `1`
+     - :part:`97%`
    * - compiler-rt/lib/scudo/standalone/benchmarks
      - `1`
      - `1`
@@ -1496,9 +1621,9 @@
      - :good:`100%`
    * - compiler-rt/lib/scudo/standalone/tests
      - `25`
-     - `25`
-     - `0`
-     - :good:`100%`
+     - `24`
+     - `1`
+     - :part:`96%`
    * - compiler-rt/lib/scudo/standalone/tools
      - `1`
      - `1`
@@ -1570,19 +1695,19 @@
      - `0`
      - :good:`100%`
    * - cross-project-tests/debuginfo-tests/clang_llvm_roundtrip
+     - `2`
      - `1`
-     - `0`
      - `1`
-     - :none:`0%`
+     - :part:`50%`
    * - cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty
-     - `8`
+     - `10`
      - `0`
-     - `8`
+     - `10`
      - :none:`0%`
    * - cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect
-     - `5`
+     - `7`
      - `0`
-     - `5`
+     - `7`
      - :none:`0%`
    * - cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address
      - `7`
@@ -1630,9 +1755,9 @@
      - `1`
      - :none:`0%`
    * - cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/clang-opt-bisect
-     - `1`
+     - `2`
      - `0`
-     - `1`
+     - `2`
      - :none:`0%`
    * - cross-project-tests/debuginfo-tests/dexter-tests
      - `15`
@@ -1654,7 +1779,7 @@
      - `1`
      - `0`
      - :good:`100%`
-   * - flang/examples/flang-omp-report-plugin
+   * - flang/examples/FlangOmpReport
      - `3`
      - `3`
      - `0`
@@ -1670,8 +1795,8 @@
      - `0`
      - :good:`100%`
    * - flang/include/flang/Common
-     - `20`
-     - `20`
+     - `21`
+     - `21`
      - `0`
      - :good:`100%`
    * - flang/include/flang/Decimal
@@ -1685,8 +1810,8 @@
      - `0`
      - :good:`100%`
    * - flang/include/flang/Frontend
+     - `11`
      - `10`
-     - `9`
      - `1`
      - :part:`90%`
    * - flang/include/flang/FrontendTool
@@ -1695,10 +1820,10 @@
      - `0`
      - :good:`100%`
    * - flang/include/flang/Lower
-     - `19`
-     - `19`
-     - `0`
-     - :good:`100%`
+     - `25`
+     - `24`
+     - `1`
+     - :part:`96%`
    * - flang/include/flang/Lower/Support
      - `2`
      - `2`
@@ -1710,8 +1835,8 @@
      - `0`
      - :good:`100%`
    * - flang/include/flang/Optimizer/Builder/Runtime
-     - `8`
-     - `8`
+     - `10`
+     - `10`
      - `0`
      - :good:`100%`
    * - flang/include/flang/Optimizer/CodeGen
@@ -1740,8 +1865,8 @@
      - `1`
      - :part:`94%`
    * - flang/include/flang/Runtime
+     - `28`
      - `27`
-     - `26`
      - `1`
      - :part:`96%`
    * - flang/include/flang/Semantics
@@ -1775,38 +1900,38 @@
      - `0`
      - :good:`100%`
    * - flang/lib/Lower
-     - `17`
-     - `16`
-     - `1`
-     - :part:`94%`
+     - `20`
+     - `20`
+     - `0`
+     - :good:`100%`
    * - flang/lib/Optimizer/Builder
      - `6`
      - `6`
      - `0`
      - :good:`100%`
    * - flang/lib/Optimizer/Builder/Runtime
-     - `7`
-     - `7`
+     - `9`
+     - `9`
      - `0`
      - :good:`100%`
    * - flang/lib/Optimizer/CodeGen
      - `10`
-     - `9`
-     - `1`
-     - :part:`90%`
+     - `10`
+     - `0`
+     - :good:`100%`
    * - flang/lib/Optimizer/Dialect
-     - `4`
-     - `3`
-     - `1`
-     - :part:`75%`
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
    * - flang/lib/Optimizer/Support
-     - `3`
-     - `3`
+     - `4`
+     - `4`
      - `0`
      - :good:`100%`
    * - flang/lib/Optimizer/Transforms
-     - `11`
-     - `11`
+     - `10`
+     - `10`
      - `0`
      - :good:`100%`
    * - flang/lib/Parser
@@ -1816,19 +1941,24 @@
      - :good:`100%`
    * - flang/lib/Semantics
      - `78`
-     - `73`
-     - `5`
-     - :part:`93%`
+     - `69`
+     - `9`
+     - :part:`88%`
    * - flang/module
      - `1`
      - `1`
      - `0`
      - :good:`100%`
    * - flang/runtime
-     - `73`
+     - `74`
      - `72`
+     - `2`
+     - :part:`97%`
+   * - flang/tools/bbc
+     - `1`
      - `1`
-     - :part:`98%`
+     - `0`
+     - :good:`100%`
    * - flang/tools/f18
      - `1`
      - `1`
@@ -1854,6 +1984,11 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - flang/unittests/Common
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - flang/unittests/Decimal
      - `2`
      - `2`
@@ -1880,13 +2015,13 @@
      - `0`
      - :good:`100%`
    * - flang/unittests/Optimizer/Builder/Runtime
-     - `8`
-     - `8`
+     - `10`
+     - `10`
      - `0`
      - :good:`100%`
    * - flang/unittests/Runtime
-     - `21`
-     - `21`
+     - `22`
+     - `22`
      - `0`
      - :good:`100%`
    * - libc/AOR_v20.02/math
@@ -1960,8 +2095,28 @@
      - `1`
      - :part:`66%`
    * - libc/include
-     - `3`
-     - `3`
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libc/include/llvm-libc-macros
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - libc/include/llvm-libc-macros/linux
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libc/include/llvm-libc-types
+     - `28`
+     - `28`
+     - `0`
+     - :good:`100%`
+   * - libc/loader/linux/aarch64
+     - `1`
+     - `1`
      - `0`
      - :good:`100%`
    * - libc/loader/linux/x86_64
@@ -1980,8 +2135,18 @@
      - `0`
      - :good:`100%`
    * - libc/src/errno
-     - `5`
-     - `5`
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - libc/src/fcntl
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - libc/src/fcntl/linux
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - libc/src/fenv
@@ -1995,8 +2160,8 @@
      - `0`
      - :good:`100%`
    * - libc/src/math
-     - `88`
-     - `88`
+     - `91`
+     - `91`
      - `0`
      - :good:`100%`
    * - libc/src/math/aarch64
@@ -2005,13 +2170,13 @@
      - `0`
      - :good:`100%`
    * - libc/src/math/generic
-     - `89`
-     - `89`
+     - `94`
+     - `94`
      - `0`
      - :good:`100%`
    * - libc/src/math/x86_64
-     - `6`
-     - `6`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - libc/src/signal
@@ -2030,13 +2195,13 @@
      - `0`
      - :good:`100%`
    * - libc/src/stdlib
-     - `41`
-     - `41`
+     - `46`
+     - `46`
      - `0`
      - :good:`100%`
    * - libc/src/stdlib/linux
-     - `1`
-     - `1`
+     - `2`
+     - `2`
      - `0`
      - :good:`100%`
    * - libc/src/string
@@ -2046,9 +2211,9 @@
      - :good:`100%`
    * - libc/src/string/memory_utils
      - `8`
-     - `8`
-     - `0`
-     - :good:`100%`
+     - `7`
+     - `1`
+     - :part:`87%`
    * - libc/src/sys/mman
      - `2`
      - `2`
@@ -2056,32 +2221,42 @@
      - :good:`100%`
    * - libc/src/sys/mman/linux
      - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - libc/src/sys/stat
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - libc/src/sys/stat/linux
+     - `2`
      - `2`
      - `0`
      - :good:`100%`
    * - libc/src/threads
-     - `12`
-     - `12`
+     - `16`
+     - `16`
      - `0`
      - :good:`100%`
    * - libc/src/threads/linux
-     - `16`
-     - `9`
+     - `11`
      - `7`
-     - :part:`56%`
+     - `4`
+     - :part:`63%`
    * - libc/src/time
      - `12`
      - `12`
      - `0`
      - :good:`100%`
    * - libc/src/unistd
-     - `1`
-     - `1`
+     - `7`
+     - `7`
      - `0`
      - :good:`100%`
    * - libc/src/unistd/linux
-     - `1`
-     - `1`
+     - `7`
+     - `7`
      - `0`
      - :good:`100%`
    * - libc/src/__support
@@ -2090,30 +2265,35 @@
      - `0`
      - :good:`100%`
    * - libc/src/__support/CPP
-     - `7`
-     - `7`
-     - `0`
-     - :good:`100%`
-   * - libc/src/__support/FPUtil
-     - `16`
-     - `16`
+     - `11`
+     - `10`
+     - `1`
+     - :part:`90%`
+   * - libc/src/__support/File
+     - `2`
+     - `2`
      - `0`
      - :good:`100%`
+   * - libc/src/__support/FPUtil
+     - `15`
+     - `14`
+     - `1`
+     - :part:`93%`
    * - libc/src/__support/FPUtil/aarch64
-     - `2`
-     - `2`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - libc/src/__support/FPUtil/generic
-     - `1`
-     - `1`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - libc/src/__support/FPUtil/x86_64
      - `6`
-     - `6`
-     - `0`
-     - :good:`100%`
+     - `5`
+     - `1`
+     - :part:`83%`
    * - libc/src/__support/OSUtil
      - `3`
      - `3`
@@ -2121,7 +2301,12 @@
      - :good:`100%`
    * - libc/src/__support/OSUtil/linux
      - `3`
-     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - libc/src/__support/OSUtil/linux/aarch64
+     - `1`
+     - `1`
      - `0`
      - :good:`100%`
    * - libc/src/__support/OSUtil/linux/x86_64
@@ -2129,6 +2314,16 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - libc/src/__support/threads
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libc/src/__support/threads/linux
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - libc/utils/HdrGen
      - `9`
      - `9`
@@ -2160,10 +2355,10 @@
      - `0`
      - :good:`100%`
    * - libc/utils/UnitTest
-     - `10`
-     - `10`
-     - `0`
-     - :good:`100%`
+     - `12`
+     - `11`
+     - `1`
+     - :part:`91%`
    * - libclc/generic/include
      - `2`
      - `1`
@@ -2305,20 +2500,20 @@
      - `1`
      - :none:`0%`
    * - libcxx/benchmarks
-     - `27`
-     - `9`
+     - `28`
+     - `10`
      - `18`
-     - :part:`33%`
+     - :part:`35%`
    * - libcxx/include
-     - `23`
+     - `22`
      - `0`
-     - `23`
+     - `22`
      - :none:`0%`
    * - libcxx/include/__algorithm
-     - `93`
-     - `0`
-     - `93`
-     - :none:`0%`
+     - `102`
+     - `15`
+     - `87`
+     - :part:`14%`
    * - libcxx/include/__bit
      - `2`
      - `0`
@@ -2329,11 +2524,16 @@
      - `0`
      - `3`
      - :none:`0%`
-   * - libcxx/include/__compare
-     - `10`
+   * - libcxx/include/__chrono
+     - `8`
      - `0`
-     - `10`
+     - `8`
      - :none:`0%`
+   * - libcxx/include/__compare
+     - `13`
+     - `1`
+     - `12`
+     - :part:`7%`
    * - libcxx/include/__concepts
      - `22`
      - `0`
@@ -2350,24 +2550,29 @@
      - `13`
      - :part:`18%`
    * - libcxx/include/__format
+     - `17`
+     - `2`
      - `15`
-     - `0`
-     - `15`
-     - :none:`0%`
+     - :part:`11%`
    * - libcxx/include/__functional
      - `27`
      - `0`
      - `27`
      - :none:`0%`
+   * - libcxx/include/__ios
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
    * - libcxx/include/__iterator
-     - `32`
+     - `36`
      - `0`
-     - `32`
+     - `36`
      - :none:`0%`
    * - libcxx/include/__memory
-     - `18`
+     - `19`
      - `1`
-     - `17`
+     - `18`
      - :part:`5%`
    * - libcxx/include/__numeric
      - `13`
@@ -2375,15 +2580,15 @@
      - `9`
      - :part:`30%`
    * - libcxx/include/__random
-     - `36`
-     - `0`
-     - `36`
-     - :none:`0%`
+     - `37`
+     - `2`
+     - `35`
+     - :part:`5%`
    * - libcxx/include/__ranges
-     - `25`
-     - `0`
-     - `25`
-     - :none:`0%`
+     - `29`
+     - `2`
+     - `27`
+     - :part:`6%`
    * - libcxx/include/__support/android
      - `1`
      - `0`
@@ -2430,25 +2635,25 @@
      - `3`
      - :none:`0%`
    * - libcxx/include/__thread
-     - `1`
+     - `2`
      - `0`
-     - `1`
+     - `2`
      - :none:`0%`
    * - libcxx/include/__utility
-     - `16`
-     - `0`
-     - `16`
-     - :none:`0%`
+     - `17`
+     - `5`
+     - `12`
+     - :part:`29%`
    * - libcxx/include/__variant
      - `1`
      - `0`
      - `1`
      - :none:`0%`
    * - libcxx/src
-     - `41`
-     - `5`
+     - `42`
+     - `6`
      - `36`
-     - :part:`12%`
+     - :part:`14%`
    * - libcxx/src/experimental
      - `2`
      - `1`
@@ -2530,10 +2735,10 @@
      - `24`
      - :part:`35%`
    * - lld/Common
-     - `10`
-     - `8`
+     - `11`
+     - `9`
      - `2`
-     - :part:`80%`
+     - :part:`81%`
    * - lld/ELF
      - `48`
      - `25`
@@ -2545,18 +2750,18 @@
      - `10`
      - :part:`28%`
    * - lld/include/lld/Common
-     - `13`
+     - `14`
+     - `8`
      - `6`
-     - `7`
-     - :part:`46%`
+     - :part:`57%`
    * - lld/include/lld/Core
      - `20`
      - `4`
      - `16`
      - :part:`20%`
    * - lld/MachO
+     - `45`
      - `43`
-     - `41`
      - `2`
      - :part:`95%`
    * - lld/MachO/Arch
@@ -2631,9 +2836,9 @@
      - :part:`36%`
    * - lldb/include/lldb/Core
      - `61`
-     - `32`
-     - `29`
-     - :part:`52%`
+     - `31`
+     - `30`
+     - :part:`50%`
    * - lldb/include/lldb/DataFormatters
      - `18`
      - `10`
@@ -2710,30 +2915,30 @@
      - `21`
      - :part:`40%`
    * - lldb/include/lldb/Target
-     - `77`
-     - `50`
+     - `78`
+     - `51`
      - `27`
-     - :part:`64%`
+     - :part:`65%`
    * - lldb/include/lldb/Utility
      - `63`
-     - `40`
-     - `23`
-     - :part:`63%`
+     - `41`
+     - `22`
+     - :part:`65%`
    * - lldb/include/lldb/Version
      - `1`
      - `1`
      - `0`
      - :good:`100%`
    * - lldb/source/API
-     - `74`
-     - `6`
-     - `68`
-     - :part:`8%`
+     - `73`
+     - `36`
+     - `37`
+     - :part:`49%`
    * - lldb/source/Breakpoint
      - `24`
-     - `5`
-     - `19`
-     - :part:`20%`
+     - `6`
+     - `18`
+     - :part:`25%`
    * - lldb/source/Commands
      - `70`
      - `57`
@@ -2741,9 +2946,9 @@
      - :part:`81%`
    * - lldb/source/Core
      - `49`
-     - `25`
-     - `24`
-     - :part:`51%`
+     - `26`
+     - `23`
+     - :part:`53%`
    * - lldb/source/DataFormatters
      - `16`
      - `3`
@@ -2771,9 +2976,9 @@
      - :good:`100%`
    * - lldb/source/Host/linux
      - `5`
-     - `4`
-     - `1`
-     - :part:`80%`
+     - `5`
+     - `0`
+     - :good:`100%`
    * - lldb/source/Host/macosx/cfcpp
      - `14`
      - `12`
@@ -2796,9 +3001,9 @@
      - :part:`50%`
    * - lldb/source/Host/posix
      - `9`
-     - `5`
-     - `4`
-     - :part:`55%`
+     - `6`
+     - `3`
+     - :part:`66%`
    * - lldb/source/Host/windows
      - `11`
      - `7`
@@ -3006,9 +3211,9 @@
      - :none:`0%`
    * - lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime
      - `16`
-     - `6`
-     - `10`
-     - :part:`37%`
+     - `5`
+     - `11`
+     - :part:`31%`
    * - lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime
      - `8`
      - `3`
@@ -3146,9 +3351,9 @@
      - :part:`80%`
    * - lldb/source/Plugins/Process/gdb-remote
      - `26`
-     - `14`
-     - `12`
-     - :part:`53%`
+     - `15`
+     - `11`
+     - :part:`57%`
    * - lldb/source/Plugins/Process/Linux
      - `21`
      - `11`
@@ -3176,9 +3381,9 @@
      - :part:`50%`
    * - lldb/source/Plugins/Process/POSIX
      - `8`
-     - `5`
-     - `3`
-     - :part:`62%`
+     - `7`
+     - `1`
+     - :part:`87%`
    * - lldb/source/Plugins/Process/scripted
      - `4`
      - `4`
@@ -3246,9 +3451,9 @@
      - :none:`0%`
    * - lldb/source/Plugins/SymbolFile/DWARF
      - `65`
-     - `38`
-     - `27`
-     - :part:`58%`
+     - `39`
+     - `26`
+     - :part:`60%`
    * - lldb/source/Plugins/SymbolFile/NativePDB
      - `20`
      - `10`
@@ -3325,15 +3530,15 @@
      - `13`
      - :part:`58%`
    * - lldb/source/Target
-     - `68`
-     - `33`
+     - `69`
+     - `34`
      - `35`
-     - :part:`48%`
+     - :part:`49%`
    * - lldb/source/Utility
      - `58`
-     - `45`
-     - `13`
-     - :part:`77%`
+     - `46`
+     - `12`
+     - :part:`79%`
    * - lldb/source/Version
      - `1`
      - `1`
@@ -3411,9 +3616,9 @@
      - :part:`40%`
    * - lldb/tools/lldb-vscode
      - `27`
-     - `25`
-     - `2`
-     - :part:`92%`
+     - `24`
+     - `3`
+     - :part:`88%`
    * - lldb/unittests
      - `1`
      - `1`
@@ -3460,10 +3665,10 @@
      - `2`
      - :part:`60%`
    * - lldb/unittests/Host
-     - `15`
-     - `10`
+     - `16`
+     - `11`
      - `5`
-     - :part:`66%`
+     - :part:`68%`
    * - lldb/unittests/Host/linux
      - `2`
      - `2`
@@ -3480,10 +3685,10 @@
      - `1`
      - :none:`0%`
    * - lldb/unittests/Interpreter
-     - `5`
-     - `1`
+     - `6`
+     - `2`
      - `4`
-     - :part:`20%`
+     - :part:`33%`
    * - lldb/unittests/Language/CLanguages
      - `1`
      - `1`
@@ -3520,10 +3725,10 @@
      - `1`
      - :none:`0%`
    * - lldb/unittests/Platform
+     - `3`
      - `2`
      - `1`
-     - `1`
-     - :part:`50%`
+     - :part:`66%`
    * - lldb/unittests/Platform/Android
      - `1`
      - `0`
@@ -3610,10 +3815,10 @@
      - `0`
      - :good:`100%`
    * - lldb/unittests/Target
-     - `9`
-     - `5`
+     - `10`
+     - `6`
      - `4`
-     - :part:`55%`
+     - :part:`60%`
    * - lldb/unittests/TestingSupport
      - `5`
      - `4`
@@ -3640,9 +3845,9 @@
      - `2`
      - :none:`0%`
    * - lldb/unittests/tools/lldb-server/tests
-     - `8`
+     - `7`
      - `0`
-     - `8`
+     - `7`
      - :none:`0%`
    * - lldb/unittests/UnwindAssembly/ARM64
      - `1`
@@ -3660,10 +3865,10 @@
      - `1`
      - :none:`0%`
    * - lldb/unittests/Utility
-     - `46`
+     - `45`
      - `32`
-     - `14`
-     - :part:`69%`
+     - `13`
+     - :part:`71%`
    * - lldb/utils/lit-cpuid
      - `1`
      - `0`
@@ -3895,15 +4100,15 @@
      - `6`
      - :part:`25%`
    * - llvm/include/llvm/ADT
-     - `91`
-     - `26`
-     - `65`
-     - :part:`28%`
+     - `93`
+     - `25`
+     - `68`
+     - :part:`26%`
    * - llvm/include/llvm/Analysis
-     - `129`
-     - `51`
+     - `130`
+     - `52`
      - `78`
-     - :part:`39%`
+     - :part:`40%`
    * - llvm/include/llvm/Analysis/Utils
      - `3`
      - `1`
@@ -3915,30 +4120,30 @@
      - `3`
      - :part:`40%`
    * - llvm/include/llvm/BinaryFormat
-     - `14`
+     - `15`
      - `8`
-     - `6`
-     - :part:`57%`
+     - `7`
+     - :part:`53%`
    * - llvm/include/llvm/Bitcode
      - `7`
-     - `3`
-     - `4`
-     - :part:`42%`
+     - `2`
+     - `5`
+     - :part:`28%`
    * - llvm/include/llvm/Bitstream
      - `3`
      - `0`
      - `3`
      - :none:`0%`
    * - llvm/include/llvm/CodeGen
-     - `156`
-     - `50`
-     - `106`
+     - `158`
+     - `51`
+     - `107`
      - :part:`32%`
    * - llvm/include/llvm/CodeGen/GlobalISel
-     - `29`
-     - `9`
-     - `20`
-     - :part:`31%`
+     - `27`
+     - `8`
+     - `19`
+     - :part:`29%`
    * - llvm/include/llvm/CodeGen/MIRParser
      - `2`
      - `1`
@@ -3990,13 +4195,13 @@
      - `19`
      - :part:`64%`
    * - llvm/include/llvm/DebugInfo/Symbolize
+     - `5`
      - `3`
      - `2`
-     - `1`
-     - :part:`66%`
+     - :part:`60%`
    * - llvm/include/llvm/Debuginfod
-     - `2`
-     - `2`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - llvm/include/llvm/Demangle
@@ -4021,19 +4226,19 @@
      - :part:`16%`
    * - llvm/include/llvm/ExecutionEngine/JITLink
      - `16`
-     - `13`
-     - `3`
-     - :part:`81%`
+     - `14`
+     - `2`
+     - :part:`87%`
    * - llvm/include/llvm/ExecutionEngine/Orc
      - `38`
-     - `28`
-     - `10`
-     - :part:`73%`
+     - `29`
+     - `9`
+     - :part:`76%`
    * - llvm/include/llvm/ExecutionEngine/Orc/Shared
-     - `7`
+     - `8`
      - `4`
-     - `3`
-     - :part:`57%`
+     - `4`
+     - :part:`50%`
    * - llvm/include/llvm/ExecutionEngine/Orc/TargetProcess
      - `7`
      - `7`
@@ -4060,9 +4265,9 @@
      - `0`
      - :good:`100%`
    * - llvm/include/llvm/IR
-     - `92`
+     - `93`
      - `28`
-     - `64`
+     - `65`
      - :part:`30%`
    * - llvm/include/llvm/IRReader
      - `1`
@@ -4091,9 +4296,9 @@
      - :none:`0%`
    * - llvm/include/llvm/MC
      - `74`
-     - `23`
-     - `51`
-     - :part:`31%`
+     - `24`
+     - `50`
+     - :part:`32%`
    * - llvm/include/llvm/MC/MCDisassembler
      - `4`
      - `1`
@@ -4119,6 +4324,36 @@
      - `8`
      - `0`
      - :good:`100%`
+   * - llvm/include/llvm/ObjCopy
+     - `4`
+     - `3`
+     - `1`
+     - :part:`75%`
+   * - llvm/include/llvm/ObjCopy/COFF
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/ObjCopy/ELF
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/ObjCopy/MachO
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/ObjCopy/wasm
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/ObjCopy/XCOFF
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
    * - llvm/include/llvm/Object
      - `31`
      - `12`
@@ -4140,10 +4375,10 @@
      - `2`
      - :part:`50%`
    * - llvm/include/llvm/ProfileData
-     - `10`
-     - `5`
+     - `11`
      - `5`
-     - :part:`50%`
+     - `6`
+     - :part:`45%`
    * - llvm/include/llvm/ProfileData/Coverage
      - `3`
      - `2`
@@ -4155,9 +4390,9 @@
      - `1`
      - :part:`91%`
    * - llvm/include/llvm/Support
-     - `182`
-     - `67`
-     - `115`
+     - `186`
+     - `68`
+     - `118`
      - :part:`36%`
    * - llvm/include/llvm/Support/FileSystem
      - `1`
@@ -4175,10 +4410,10 @@
      - `1`
      - :none:`0%`
    * - llvm/include/llvm/TableGen
-     - `8`
-     - `2`
+     - `9`
+     - `3`
      - `6`
-     - :part:`25%`
+     - :part:`33%`
    * - llvm/include/llvm/Target
      - `6`
      - `2`
@@ -4231,24 +4466,29 @@
      - :part:`58%`
    * - llvm/include/llvm/Transforms/IPO
      - `38`
-     - `27`
-     - `11`
-     - :part:`71%`
+     - `28`
+     - `10`
+     - :part:`73%`
    * - llvm/include/llvm/Transforms/Scalar
      - `75`
      - `47`
      - `28`
      - :part:`62%`
    * - llvm/include/llvm/Transforms/Utils
-     - `73`
-     - `43`
+     - `74`
+     - `44`
      - `30`
-     - :part:`58%`
+     - :part:`59%`
    * - llvm/include/llvm/Transforms/Vectorize
      - `5`
      - `1`
      - `4`
      - :part:`20%`
+   * - llvm/include/llvm/WindowsDriver
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
    * - llvm/include/llvm/WindowsManifest
      - `1`
      - `1`
@@ -4285,10 +4525,10 @@
      - `2`
      - :part:`33%`
    * - llvm/lib/BinaryFormat
-     - `12`
-     - `9`
+     - `13`
+     - `10`
      - `3`
-     - :part:`75%`
+     - :part:`76%`
    * - llvm/lib/Bitcode/Reader
      - `7`
      - `2`
@@ -4305,25 +4545,25 @@
      - `1`
      - :none:`0%`
    * - llvm/lib/CodeGen
-     - `215`
-     - `57`
-     - `158`
-     - :part:`26%`
+     - `220`
+     - `60`
+     - `160`
+     - :part:`27%`
    * - llvm/lib/CodeGen/AsmPrinter
      - `45`
      - `18`
      - `27`
      - :part:`40%`
    * - llvm/lib/CodeGen/GlobalISel
-     - `26`
+     - `24`
      - `9`
-     - `17`
-     - :part:`34%`
+     - `15`
+     - :part:`37%`
    * - llvm/lib/CodeGen/LiveDebugValues
      - `5`
-     - `2`
-     - `3`
-     - :part:`40%`
+     - `1`
+     - `4`
+     - :part:`20%`
    * - llvm/lib/CodeGen/MIRParser
      - `4`
      - `1`
@@ -4356,9 +4596,9 @@
      - :part:`75%`
    * - llvm/lib/DebugInfo/PDB
      - `40`
-     - `34`
-     - `6`
-     - :part:`85%`
+     - `35`
+     - `5`
+     - :part:`87%`
    * - llvm/lib/DebugInfo/PDB/DIA
      - `18`
      - `15`
@@ -4371,12 +4611,12 @@
      - :part:`74%`
    * - llvm/lib/DebugInfo/Symbolize
      - `4`
-     - `2`
-     - `2`
-     - :part:`50%`
+     - `3`
+     - `1`
+     - :part:`75%`
    * - llvm/lib/Debuginfod
-     - `2`
-     - `2`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - llvm/lib/Demangle
@@ -4411,9 +4651,9 @@
      - :none:`0%`
    * - llvm/lib/ExecutionEngine/JITLink
      - `23`
-     - `17`
-     - `6`
-     - :part:`73%`
+     - `15`
+     - `8`
+     - :part:`65%`
    * - llvm/lib/ExecutionEngine/MCJIT
      - `2`
      - `0`
@@ -4430,8 +4670,8 @@
      - `15`
      - :part:`59%`
    * - llvm/lib/ExecutionEngine/Orc/Shared
-     - `3`
-     - `3`
+     - `4`
+     - `4`
      - `0`
      - :good:`100%`
    * - llvm/lib/ExecutionEngine/Orc/TargetProcess
@@ -4539,11 +4779,41 @@
      - `7`
      - `1`
      - :part:`87%`
+   * - llvm/lib/ObjCopy
+     - `4`
+     - `3`
+     - `1`
+     - :part:`75%`
+   * - llvm/lib/ObjCopy/COFF
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/ObjCopy/ELF
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/ObjCopy/MachO
+     - `9`
+     - `9`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/ObjCopy/wasm
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/ObjCopy/XCOFF
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
    * - llvm/lib/Object
      - `31`
-     - `15`
      - `16`
-     - :part:`48%`
+     - `15`
+     - :part:`51%`
    * - llvm/lib/ObjectYAML
      - `23`
      - `9`
@@ -4560,10 +4830,10 @@
      - `3`
      - :part:`50%`
    * - llvm/lib/ProfileData
-     - `10`
+     - `11`
      - `4`
-     - `6`
-     - :part:`40%`
+     - `7`
+     - :part:`36%`
    * - llvm/lib/ProfileData/Coverage
      - `3`
      - `0`
@@ -4575,30 +4845,30 @@
      - `3`
      - :part:`76%`
    * - llvm/lib/Support
-     - `141`
-     - `58`
+     - `144`
+     - `61`
      - `83`
-     - :part:`41%`
+     - :part:`42%`
    * - llvm/lib/Support/Unix
      - `1`
      - `0`
      - `1`
      - :none:`0%`
    * - llvm/lib/TableGen
-     - `13`
-     - `1`
+     - `15`
+     - `3`
      - `12`
-     - :part:`7%`
+     - :part:`20%`
    * - llvm/lib/Target
      - `5`
-     - `0`
-     - `5`
-     - :none:`0%`
+     - `1`
+     - `4`
+     - :part:`20%`
    * - llvm/lib/Target/AArch64
      - `60`
-     - `6`
-     - `54`
-     - :part:`10%`
+     - `7`
+     - `53`
+     - :part:`11%`
    * - llvm/lib/Target/AArch64/AsmParser
      - `1`
      - `0`
@@ -4631,9 +4901,9 @@
      - :none:`0%`
    * - llvm/lib/Target/AMDGPU
      - `169`
-     - `39`
-     - `130`
-     - :part:`23%`
+     - `38`
+     - `131`
+     - :part:`22%`
    * - llvm/lib/Target/AMDGPU/AsmParser
      - `1`
      - `0`
@@ -4651,9 +4921,9 @@
      - :good:`100%`
    * - llvm/lib/Target/AMDGPU/MCTargetDesc
      - `21`
-     - `6`
-     - `15`
-     - :part:`28%`
+     - `5`
+     - `16`
+     - :part:`23%`
    * - llvm/lib/Target/AMDGPU/TargetInfo
      - `2`
      - `1`
@@ -4716,9 +4986,9 @@
      - :none:`0%`
    * - llvm/lib/Target/AVR
      - `24`
-     - `24`
-     - `0`
-     - :good:`100%`
+     - `23`
+     - `1`
+     - :part:`95%`
    * - llvm/lib/Target/AVR/AsmParser
      - `1`
      - `1`
@@ -4731,10 +5001,10 @@
      - :good:`100%`
    * - llvm/lib/Target/AVR/MCTargetDesc
      - `20`
-     - `20`
-     - `0`
-     - :good:`100%`
-   * - llvm/lib/Target/AVR/TargetInfo
+     - `18`
+     - `2`
+     - :part:`90%`
+   * - llvm/lib/Target/AVR/TargetInfo
      - `2`
      - `2`
      - `0`
@@ -4765,8 +5035,8 @@
      - `1`
      - :part:`50%`
    * - llvm/lib/Target/CSKY
-     - `20`
-     - `20`
+     - `23`
+     - `23`
      - `0`
      - :good:`100%`
    * - llvm/lib/Target/CSKY/AsmParser
@@ -4774,11 +5044,16 @@
      - `1`
      - `0`
      - :good:`100%`
-   * - llvm/lib/Target/CSKY/MCTargetDesc
-     - `15`
-     - `15`
+   * - llvm/lib/Target/CSKY/Disassembler
+     - `1`
+     - `1`
      - `0`
      - :good:`100%`
+   * - llvm/lib/Target/CSKY/MCTargetDesc
+     - `15`
+     - `14`
+     - `1`
+     - :part:`93%`
    * - llvm/lib/Target/CSKY/TargetInfo
      - `2`
      - `2`
@@ -4834,6 +5109,21 @@
      - `2`
      - `0`
      - :good:`100%`
+   * - llvm/lib/Target/LoongArch
+     - `19`
+     - `19`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/LoongArch/MCTargetDesc
+     - `12`
+     - `12`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/LoongArch/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
    * - llvm/lib/Target/M68k
      - `26`
      - `25`
@@ -4856,9 +5146,9 @@
      - :part:`85%`
    * - llvm/lib/Target/M68k/MCTargetDesc
      - `12`
-     - `12`
-     - `0`
-     - :good:`100%`
+     - `11`
+     - `1`
+     - :part:`91%`
    * - llvm/lib/Target/M68k/TargetInfo
      - `2`
      - `2`
@@ -4930,10 +5220,10 @@
      - `0`
      - :good:`100%`
    * - llvm/lib/Target/PowerPC
-     - `53`
-     - `4`
+     - `54`
+     - `5`
      - `49`
-     - :part:`7%`
+     - :part:`9%`
    * - llvm/lib/Target/PowerPC/AsmParser
      - `1`
      - `0`
@@ -4960,10 +5250,10 @@
      - `0`
      - :good:`100%`
    * - llvm/lib/Target/RISCV
-     - `34`
-     - `18`
-     - `16`
-     - :part:`52%`
+     - `36`
+     - `17`
+     - `19`
+     - :part:`47%`
    * - llvm/lib/Target/RISCV/AsmParser
      - `1`
      - `0`
@@ -5035,10 +5325,10 @@
      - `0`
      - :good:`100%`
    * - llvm/lib/Target/VE
-     - `21`
-     - `17`
-     - `4`
-     - :part:`80%`
+     - `24`
+     - `19`
+     - `5`
+     - :part:`79%`
    * - llvm/lib/Target/VE/AsmParser
      - `1`
      - `1`
@@ -5091,9 +5381,9 @@
      - :good:`100%`
    * - llvm/lib/Target/X86
      - `82`
-     - `18`
-     - `64`
-     - :part:`21%`
+     - `19`
+     - `63`
+     - :part:`23%`
    * - llvm/lib/Target/X86/AsmParser
      - `3`
      - `0`
@@ -5104,6 +5394,11 @@
      - `0`
      - `2`
      - :none:`0%`
+   * - llvm/lib/Target/X86/MCA
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
    * - llvm/lib/Target/X86/MCTargetDesc
      - `25`
      - `5`
@@ -5141,9 +5436,9 @@
      - :good:`100%`
    * - llvm/lib/TextAPI
      - `11`
-     - `11`
-     - `0`
-     - :good:`100%`
+     - `9`
+     - `2`
+     - :part:`81%`
    * - llvm/lib/ToolDrivers/llvm-dlltool
      - `1`
      - `0`
@@ -5180,15 +5475,15 @@
      - `15`
      - :part:`6%`
    * - llvm/lib/Transforms/Instrumentation
-     - `22`
-     - `8`
+     - `21`
+     - `7`
      - `14`
-     - :part:`36%`
+     - :part:`33%`
    * - llvm/lib/Transforms/IPO
      - `44`
-     - `10`
-     - `34`
-     - :part:`22%`
+     - `9`
+     - `35`
+     - :part:`20%`
    * - llvm/lib/Transforms/ObjCARC
      - `15`
      - `4`
@@ -5200,15 +5495,20 @@
      - `63`
      - :part:`20%`
    * - llvm/lib/Transforms/Utils
-     - `77`
-     - `18`
+     - `78`
+     - `19`
      - `59`
-     - :part:`23%`
+     - :part:`24%`
    * - llvm/lib/Transforms/Vectorize
      - `22`
      - `13`
      - `9`
      - :part:`59%`
+   * - llvm/lib/WindowsDriver
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - llvm/lib/WindowsManifest
      - `1`
      - `1`
@@ -5311,9 +5611,9 @@
      - :none:`0%`
    * - llvm/tools/llvm-cxxdump
      - `4`
-     - `2`
-     - `2`
-     - :part:`50%`
+     - `1`
+     - `3`
+     - :part:`25%`
    * - llvm/tools/llvm-cxxfilt
      - `1`
      - `1`
@@ -5344,6 +5644,11 @@
      - `0`
      - `1`
      - :none:`0%`
+   * - llvm/tools/llvm-dis-fuzzer
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - llvm/tools/llvm-dlang-demangle-fuzzer
      - `2`
      - `2`
@@ -5510,30 +5815,10 @@
      - `1`
      - :none:`0%`
    * - llvm/tools/llvm-objcopy
-     - `6`
-     - `5`
-     - `1`
-     - :part:`83%`
-   * - llvm/tools/llvm-objcopy/COFF
-     - `9`
-     - `9`
-     - `0`
-     - :good:`100%`
-   * - llvm/tools/llvm-objcopy/ELF
-     - `5`
-     - `2`
      - `3`
-     - :part:`40%`
-   * - llvm/tools/llvm-objcopy/MachO
-     - `11`
-     - `11`
-     - `0`
-     - :good:`100%`
-   * - llvm/tools/llvm-objcopy/wasm
-     - `9`
-     - `9`
-     - `0`
-     - :good:`100%`
+     - `2`
+     - `1`
+     - :part:`66%`
    * - llvm/tools/llvm-objdump
      - `15`
      - `10`
@@ -5561,9 +5846,9 @@
      - :none:`0%`
    * - llvm/tools/llvm-profgen
      - `11`
-     - `7`
-     - `4`
-     - :part:`63%`
+     - `6`
+     - `5`
+     - :part:`54%`
    * - llvm/tools/llvm-rc
      - `12`
      - `6`
@@ -5576,14 +5861,19 @@
      - :part:`15%`
    * - llvm/tools/llvm-reduce
      - `7`
-     - `7`
-     - `0`
-     - :good:`100%`
+     - `6`
+     - `1`
+     - :part:`85%`
    * - llvm/tools/llvm-reduce/deltas
      - `40`
      - `39`
      - `1`
      - :part:`97%`
+   * - llvm/tools/llvm-remark-size-diff
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - llvm/tools/llvm-rtdyld
      - `1`
      - `0`
@@ -5616,9 +5906,9 @@
      - :good:`100%`
    * - llvm/tools/llvm-split
      - `1`
-     - `1`
      - `0`
-     - :good:`100%`
+     - `1`
+     - :none:`0%`
    * - llvm/tools/llvm-stress
      - `1`
      - `0`
@@ -5715,10 +6005,10 @@
      - `0`
      - :good:`100%`
    * - llvm/unittests/ADT
-     - `78`
-     - `32`
-     - `46`
-     - :part:`41%`
+     - `77`
+     - `29`
+     - `48`
+     - :part:`37%`
    * - llvm/unittests/Analysis
      - `38`
      - `13`
@@ -5745,10 +6035,10 @@
      - `1`
      - :part:`50%`
    * - llvm/unittests/CodeGen
-     - `19`
-     - `9`
+     - `20`
      - `10`
-     - :part:`47%`
+     - `10`
+     - :part:`50%`
    * - llvm/unittests/CodeGen/GlobalISel
      - `13`
      - `2`
@@ -5761,9 +6051,9 @@
      - :part:`50%`
    * - llvm/unittests/DebugInfo/DWARF
      - `17`
-     - `12`
-     - `5`
-     - :part:`70%`
+     - `13`
+     - `4`
+     - :part:`76%`
    * - llvm/unittests/DebugInfo/GSYM
      - `1`
      - `0`
@@ -5821,9 +6111,9 @@
      - :none:`0%`
    * - llvm/unittests/Frontend
      - `4`
-     - `4`
-     - `0`
-     - :good:`100%`
+     - `3`
+     - `1`
+     - :part:`75%`
    * - llvm/unittests/FuzzMutate
      - `4`
      - `0`
@@ -5836,9 +6126,9 @@
      - :good:`100%`
    * - llvm/unittests/IR
      - `36`
-     - `5`
-     - `31`
-     - :part:`13%`
+     - `6`
+     - `30`
+     - :part:`16%`
    * - llvm/unittests/LineEditor
      - `1`
      - `0`
@@ -5874,6 +6164,11 @@
      - `0`
      - `1`
      - :none:`0%`
+   * - llvm/unittests/ObjCopy
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - llvm/unittests/Object
      - `9`
      - `6`
@@ -5895,20 +6190,20 @@
      - `0`
      - :good:`100%`
    * - llvm/unittests/ProfileData
-     - `4`
-     - `1`
+     - `5`
+     - `2`
      - `3`
-     - :part:`25%`
+     - :part:`40%`
    * - llvm/unittests/Remarks
      - `8`
      - `5`
      - `3`
      - :part:`62%`
    * - llvm/unittests/Support
-     - `98`
-     - `33`
+     - `100`
+     - `35`
      - `65`
-     - :part:`33%`
+     - :part:`35%`
    * - llvm/unittests/Support/CommandLineInit
      - `1`
      - `1`
@@ -5920,10 +6215,10 @@
      - `4`
      - :none:`0%`
    * - llvm/unittests/TableGen
+     - `3`
+     - `1`
      - `2`
-     - `0`
-     - `2`
-     - :none:`0%`
+     - :part:`33%`
    * - llvm/unittests/Target/AArch64
      - `3`
      - `1`
@@ -5935,10 +6230,10 @@
      - `0`
      - :good:`100%`
    * - llvm/unittests/Target/ARM
+     - `2`
      - `1`
-     - `0`
      - `1`
-     - :none:`0%`
+     - :part:`50%`
    * - llvm/unittests/Target/PowerPC
      - `1`
      - `1`
@@ -5954,6 +6249,11 @@
      - `0`
      - `1`
      - :none:`0%`
+   * - llvm/unittests/Testing/Support
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - llvm/unittests/TextAPI
      - `5`
      - `3`
@@ -6055,15 +6355,15 @@
      - `1`
      - :none:`0%`
    * - llvm/utils/TableGen
-     - `76`
-     - `11`
+     - `78`
+     - `13`
      - `65`
-     - :part:`14%`
+     - :part:`16%`
    * - llvm/utils/TableGen/GlobalISel
      - `17`
-     - `8`
-     - `9`
-     - :part:`47%`
+     - `10`
+     - `7`
+     - :part:`58%`
    * - llvm/utils/unittest/googlemock/include/gmock
      - `12`
      - `0`
@@ -6285,18 +6585,18 @@
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Analysis
-     - `14`
-     - `12`
+     - `7`
+     - `5`
      - `2`
-     - :part:`85%`
+     - :part:`71%`
    * - mlir/include/mlir/Analysis/AliasAnalysis
      - `1`
      - `1`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Analysis/Presburger
-     - `6`
-     - `6`
+     - `9`
+     - `9`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Bindings/Python
@@ -6354,6 +6654,21 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - mlir/include/mlir/Conversion/ControlFlowToLLVM
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/ControlFlowToSPIRV
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/FuncToSPIRV
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Conversion/GPUCommon
      - `1`
      - `1`
@@ -6361,9 +6676,9 @@
      - :good:`100%`
    * - mlir/include/mlir/Conversion/GPUToNVVM
      - `1`
-     - `0`
      - `1`
-     - :none:`0%`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Conversion/GPUToROCDL
      - `2`
      - `2`
@@ -6449,6 +6764,11 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - mlir/include/mlir/Conversion/SCFToControlFlow
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Conversion/SCFToGPU
      - `2`
      - `2`
@@ -6464,11 +6784,6 @@
      - `2`
      - `0`
      - :good:`100%`
-   * - mlir/include/mlir/Conversion/SCFToStandard
-     - `1`
-     - `1`
-     - `0`
-     - :good:`100%`
    * - mlir/include/mlir/Conversion/ShapeToStandard
      - `1`
      - `1`
@@ -6484,7 +6799,7 @@
      - `2`
      - `0`
      - :good:`100%`
-   * - mlir/include/mlir/Conversion/StandardToSPIRV
+   * - mlir/include/mlir/Conversion/TensorToSPIRV
      - `2`
      - `2`
      - `0`
@@ -6535,8 +6850,13 @@
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Affine
-     - `2`
-     - `2`
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Affine/Analysis
+     - `5`
+     - `5`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Affine/IR
@@ -6555,6 +6875,11 @@
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Arithmetic/Transforms
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Arithmetic/Utils
      - `1`
      - `1`
      - `0`
@@ -6580,13 +6905,13 @@
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Bufferization/IR
-     - `2`
-     - `2`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Bufferization/Transforms
-     - `2`
-     - `2`
+     - `4`
+     - `4`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Complex/IR
@@ -6594,6 +6919,11 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - mlir/include/mlir/Dialect/ControlFlow/IR
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Dialect/DLTI
      - `2`
      - `2`
@@ -6604,6 +6934,16 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - mlir/include/mlir/Dialect/Func/IR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Func/Transforms
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Dialect/GPU
      - `5`
      - `5`
@@ -6620,18 +6960,18 @@
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize
-     - `10`
-     - `9`
-     - `1`
-     - :part:`90%`
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Dialect/Linalg/IR
      - `2`
      - `2`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Linalg/Transforms
-     - `4`
-     - `4`
+     - `5`
+     - `5`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Linalg/Utils
@@ -6665,8 +7005,8 @@
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/MemRef/Transforms
-     - `1`
-     - `1`
+     - `2`
+     - `2`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/MemRef/Utils
@@ -6700,10 +7040,15 @@
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/SCF
-     - `5`
      - `4`
-     - `1`
-     - :part:`80%`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/SCF/Utils
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Dialect/Shape/IR
      - `1`
      - `1`
@@ -6719,6 +7064,11 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - mlir/include/mlir/Dialect/SparseTensor/Pipelines
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Dialect/SparseTensor/Transforms
      - `1`
      - `1`
@@ -6749,27 +7099,17 @@
      - `1`
      - `0`
      - :good:`100%`
-   * - mlir/include/mlir/Dialect/StandardOps/IR
-     - `1`
-     - `1`
-     - `0`
-     - :good:`100%`
-   * - mlir/include/mlir/Dialect/StandardOps/Transforms
-     - `4`
-     - `4`
-     - `0`
-     - :good:`100%`
-   * - mlir/include/mlir/Dialect/StandardOps/Utils
-     - `1`
-     - `1`
-     - `0`
-     - :good:`100%`
    * - mlir/include/mlir/Dialect/Tensor/IR
-     - `2`
-     - `2`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Tensor/Transforms
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Tensor/Utils
      - `1`
      - `1`
      - `0`
@@ -6785,20 +7125,30 @@
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Tosa/Utils
-     - `2`
-     - `2`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Dialect/Utils
-     - `3`
-     - `3`
+     - `4`
+     - `4`
      - `0`
      - :good:`100%`
-   * - mlir/include/mlir/Dialect/Vector
+   * - mlir/include/mlir/Dialect/Vector/IR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Vector/Transforms
      - `4`
      - `4`
      - `0`
      - :good:`100%`
+   * - mlir/include/mlir/Dialect/Vector/Utils
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Dialect/X86Vector
      - `2`
      - `2`
@@ -6806,19 +7156,19 @@
      - :good:`100%`
    * - mlir/include/mlir/ExecutionEngine
      - `8`
-     - `6`
-     - `2`
-     - :part:`75%`
+     - `7`
+     - `1`
+     - :part:`87%`
    * - mlir/include/mlir/Interfaces
      - `14`
      - `13`
      - `1`
      - :part:`92%`
    * - mlir/include/mlir/IR
-     - `50`
+     - `49`
      - `29`
-     - `21`
-     - :part:`57%`
+     - `20`
+     - :part:`59%`
    * - mlir/include/mlir/Parser
      - `1`
      - `1`
@@ -6929,16 +7279,26 @@
      - `2`
      - `2`
      - :part:`50%`
+   * - mlir/include/mlir/Tools/PDLL/CodeGen
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Tools/PDLL/ODS
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
    * - mlir/include/mlir/Tools/PDLL/Parser
      - `1`
      - `1`
      - `0`
      - :good:`100%`
    * - mlir/include/mlir/Transforms
-     - `12`
-     - `10`
+     - `9`
+     - `7`
      - `2`
-     - :part:`83%`
+     - :part:`77%`
    * - mlir/include/mlir-c
      - `15`
      - `15`
@@ -6950,13 +7310,13 @@
      - `0`
      - :good:`100%`
    * - mlir/include/mlir-c/Dialect
-     - `9`
-     - `9`
+     - `11`
+     - `11`
      - `0`
      - :good:`100%`
    * - mlir/lib/Analysis
-     - `14`
-     - `14`
+     - `7`
+     - `7`
      - `0`
      - :good:`100%`
    * - mlir/lib/Analysis/AliasAnalysis
@@ -6965,15 +7325,15 @@
      - `0`
      - :good:`100%`
    * - mlir/lib/Analysis/Presburger
-     - `5`
-     - `5`
+     - `8`
+     - `8`
      - `0`
      - :good:`100%`
    * - mlir/lib/Bindings/Python
-     - `22`
-     - `21`
-     - `1`
-     - :part:`95%`
+     - `23`
+     - `23`
+     - `0`
+     - :good:`100%`
    * - mlir/lib/Bindings/Python/Conversions
      - `1`
      - `1`
@@ -6995,8 +7355,8 @@
      - `0`
      - :good:`100%`
    * - mlir/lib/CAPI/Dialect
-     - `13`
-     - `13`
+     - `15`
+     - `15`
      - `0`
      - :good:`100%`
    * - mlir/lib/CAPI/ExecutionEngine
@@ -7069,11 +7429,26 @@
      - `1`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Conversion/GPUCommon
-     - `5`
-     - `5`
+   * - mlir/lib/Conversion/ControlFlowToLLVM
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/ControlFlowToSPIRV
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/FuncToSPIRV
+     - `2`
+     - `2`
      - `0`
      - :good:`100%`
+   * - mlir/lib/Conversion/GPUCommon
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
    * - mlir/lib/Conversion/GPUToNVVM
      - `2`
      - `2`
@@ -7106,9 +7481,9 @@
      - :part:`50%`
    * - mlir/lib/Conversion/LinalgToStandard
      - `1`
-     - `1`
      - `0`
-     - :good:`100%`
+     - `1`
+     - :none:`0%`
    * - mlir/lib/Conversion/LLVMCommon
      - `8`
      - `8`
@@ -7164,6 +7539,11 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - mlir/lib/Conversion/SCFToControlFlow
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - mlir/lib/Conversion/SCFToGPU
      - `2`
      - `2`
@@ -7179,11 +7559,6 @@
      - `2`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Conversion/SCFToStandard
-     - `1`
-     - `1`
-     - `0`
-     - :good:`100%`
    * - mlir/lib/Conversion/ShapeToStandard
      - `2`
      - `2`
@@ -7204,7 +7579,7 @@
      - `1`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Conversion/StandardToSPIRV
+   * - mlir/lib/Conversion/TensorToSPIRV
      - `2`
      - `2`
      - `0`
@@ -7226,9 +7601,9 @@
      - :good:`100%`
    * - mlir/lib/Conversion/VectorToGPU
      - `1`
-     - `1`
      - `0`
-     - :good:`100%`
+     - `1`
+     - :none:`0%`
    * - mlir/lib/Conversion/VectorToLLVM
      - `2`
      - `2`
@@ -7254,19 +7629,24 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - mlir/lib/Dialect/Affine/Analysis
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
    * - mlir/lib/Dialect/Affine/IR
      - `3`
      - `2`
      - `1`
      - :part:`66%`
    * - mlir/lib/Dialect/Affine/Transforms
-     - `11`
-     - `11`
+     - `14`
+     - `14`
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/Affine/Utils
-     - `1`
-     - `1`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/AMX/IR
@@ -7281,12 +7661,17 @@
      - :good:`100%`
    * - mlir/lib/Dialect/Arithmetic/IR
      - `2`
-     - `2`
-     - `0`
-     - :good:`100%`
+     - `1`
+     - `1`
+     - :part:`50%`
    * - mlir/lib/Dialect/Arithmetic/Transforms
+     - `4`
      - `3`
-     - `3`
+     - `1`
+     - :part:`75%`
+   * - mlir/lib/Dialect/Arithmetic/Utils
+     - `1`
+     - `1`
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/ArmNeon/IR
@@ -7315,13 +7700,13 @@
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/Bufferization/IR
-     - `3`
-     - `3`
+     - `4`
+     - `4`
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/Bufferization/Transforms
-     - `3`
-     - `3`
+     - `7`
+     - `7`
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/Complex/IR
@@ -7329,6 +7714,11 @@
      - `2`
      - `0`
      - :good:`100%`
+   * - mlir/lib/Dialect/ControlFlow/IR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
    * - mlir/lib/Dialect/DLTI
      - `2`
      - `2`
@@ -7339,6 +7729,16 @@
      - `1`
      - `0`
      - :good:`100%`
+   * - mlir/lib/Dialect/Func/IR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Func/Transforms
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
    * - mlir/lib/Dialect/GPU/IR
      - `1`
      - `1`
@@ -7355,18 +7755,18 @@
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/Linalg/ComprehensiveBufferize
-     - `10`
-     - `9`
-     - `1`
-     - :part:`90%`
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
    * - mlir/lib/Dialect/Linalg/IR
      - `3`
      - `3`
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/Linalg/Transforms
-     - `23`
-     - `23`
+     - `25`
+     - `25`
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/Linalg/Utils
@@ -7376,9 +7776,9 @@
      - :good:`100%`
    * - mlir/lib/Dialect/LLVMIR/IR
      - `7`
-     - `7`
-     - `0`
-     - :good:`100%`
+     - `5`
+     - `2`
+     - :part:`71%`
    * - mlir/lib/Dialect/LLVMIR/Transforms
      - `2`
      - `2`
@@ -7400,10 +7800,10 @@
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/MemRef/Transforms
-     - `2`
-     - `2`
-     - `0`
-     - :good:`100%`
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
    * - mlir/lib/Dialect/MemRef/Utils
      - `1`
      - `1`
@@ -7454,6 +7854,11 @@
      - `11`
      - `1`
      - :part:`91%`
+   * - mlir/lib/Dialect/SCF/Utils
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
    * - mlir/lib/Dialect/Shape/IR
      - `1`
      - `1`
@@ -7469,11 +7874,16 @@
      - `1`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Dialect/SparseTensor/Transforms
-     - `3`
-     - `3`
+   * - mlir/lib/Dialect/SparseTensor/Pipelines
+     - `1`
+     - `1`
      - `0`
      - :good:`100%`
+   * - mlir/lib/Dialect/SparseTensor/Transforms
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
    * - mlir/lib/Dialect/SparseTensor/Utils
      - `1`
      - `1`
@@ -7490,65 +7900,65 @@
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/SPIRV/Transforms
+     - `7`
      - `6`
-     - `5`
      - `1`
-     - :part:`83%`
+     - :part:`85%`
    * - mlir/lib/Dialect/SPIRV/Utils
      - `1`
      - `1`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Dialect/StandardOps/IR
-     - `1`
-     - `1`
+   * - mlir/lib/Dialect/Tensor/IR
+     - `4`
+     - `4`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Dialect/StandardOps/Transforms
-     - `8`
-     - `8`
+   * - mlir/lib/Dialect/Tensor/Transforms
+     - `4`
+     - `4`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Dialect/StandardOps/Utils
+   * - mlir/lib/Dialect/Tensor/Utils
      - `1`
      - `1`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Dialect/Tensor/IR
-     - `3`
-     - `3`
-     - `0`
-     - :good:`100%`
-   * - mlir/lib/Dialect/Tensor/Transforms
-     - `2`
-     - `2`
-     - `0`
-     - :good:`100%`
    * - mlir/lib/Dialect/Tosa/IR
      - `1`
      - `1`
      - `0`
      - :good:`100%`
    * - mlir/lib/Dialect/Tosa/Transforms
+     - `6`
+     - `6`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Tosa/Utils
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Utils
      - `4`
      - `4`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Dialect/Tosa/Utils
+   * - mlir/lib/Dialect/Vector/IR
      - `1`
+     - `0`
      - `1`
+     - :none:`0%`
+   * - mlir/lib/Dialect/Vector/Transforms
+     - `11`
+     - `11`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Dialect/Utils
-     - `3`
-     - `3`
+   * - mlir/lib/Dialect/Vector/Utils
+     - `1`
+     - `1`
      - `0`
      - :good:`100%`
-   * - mlir/lib/Dialect/Vector
-     - `9`
-     - `8`
-     - `1`
-     - :part:`88%`
    * - mlir/lib/Dialect/X86Vector/IR
      - `1`
      - `1`
@@ -7561,24 +7971,24 @@
      - :good:`100%`
    * - mlir/lib/ExecutionEngine
      - `9`
-     - `8`
-     - `1`
-     - :part:`88%`
+     - `9`
+     - `0`
+     - :good:`100%`
    * - mlir/lib/Interfaces
      - `12`
      - `12`
      - `0`
      - :good:`100%`
    * - mlir/lib/IR
-     - `37`
-     - `34`
-     - `3`
-     - :part:`91%`
+     - `38`
+     - `31`
+     - `7`
+     - :part:`81%`
    * - mlir/lib/Parser
      - `14`
-     - `14`
-     - `0`
-     - :good:`100%`
+     - `10`
+     - `4`
+     - :part:`71%`
    * - mlir/lib/Pass
      - `8`
      - `6`
@@ -7611,9 +8021,9 @@
      - :good:`100%`
    * - mlir/lib/Target/LLVMIR
      - `7`
-     - `7`
-     - `0`
-     - :good:`100%`
+     - `6`
+     - `1`
+     - :part:`85%`
    * - mlir/lib/Target/LLVMIR/Dialect/AMX
      - `1`
      - `1`
@@ -7676,9 +8086,9 @@
      - :part:`75%`
    * - mlir/lib/Tools/mlir-lsp-server
      - `5`
-     - `5`
-     - `0`
-     - :good:`100%`
+     - `4`
+     - `1`
+     - :part:`80%`
    * - mlir/lib/Tools/mlir-lsp-server/lsp
      - `6`
      - `4`
@@ -7694,19 +8104,29 @@
      - `5`
      - `1`
      - :part:`83%`
+   * - mlir/lib/Tools/PDLL/CodeGen
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - mlir/lib/Tools/PDLL/ODS
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
    * - mlir/lib/Tools/PDLL/Parser
      - `3`
      - `1`
      - `2`
      - :part:`33%`
    * - mlir/lib/Transforms
-     - `19`
-     - `16`
-     - `3`
+     - `13`
+     - `11`
+     - `2`
      - :part:`84%`
    * - mlir/lib/Transforms/Utils
-     - `8`
-     - `8`
+     - `6`
+     - `6`
      - `0`
      - :good:`100%`
    * - mlir/lib/Translation
@@ -7721,9 +8141,9 @@
      - :good:`100%`
    * - mlir/tools/mlir-linalg-ods-gen
      - `1`
-     - `0`
      - `1`
-     - :none:`0%`
+     - `0`
+     - :good:`100%`
    * - mlir/tools/mlir-lsp-server
      - `1`
      - `1`
@@ -7769,14 +8189,9 @@
      - `4`
      - `0`
      - :good:`100%`
-   * - mlir/unittests/Analysis
-     - `5`
-     - `5`
-     - `0`
-     - :good:`100%`
    * - mlir/unittests/Analysis/Presburger
-     - `4`
-     - `4`
+     - `8`
+     - `8`
      - `0`
      - :good:`100%`
    * - mlir/unittests/Conversion/PDLToPDLInterp
@@ -7789,12 +8204,12 @@
      - `1`
      - `0`
      - :good:`100%`
-   * - mlir/unittests/Dialect/Quant
-     - `1`
-     - `1`
+   * - mlir/unittests/Dialect/Affine/Analysis
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
-   * - mlir/unittests/Dialect/SCF
+   * - mlir/unittests/Dialect/Quant
      - `1`
      - `1`
      - `0`
@@ -7830,8 +8245,8 @@
      - `0`
      - :good:`100%`
    * - mlir/unittests/Pass
-     - `2`
-     - `2`
+     - `3`
+     - `3`
      - `0`
      - :good:`100%`
    * - mlir/unittests/Rewrite
@@ -7866,49 +8281,14 @@
      - :good:`100%`
    * - openmp/libomptarget/DeviceRTL/src
      - `12`
-     - `10`
-     - `2`
-     - :part:`83%`
-   * - openmp/libomptarget/deviceRTLs
-     - `2`
-     - `2`
-     - `0`
-     - :good:`100%`
-   * - openmp/libomptarget/deviceRTLs/amdgcn/src
-     - `2`
-     - `2`
-     - `0`
-     - :good:`100%`
-   * - openmp/libomptarget/deviceRTLs/common
-     - `7`
-     - `6`
-     - `1`
-     - :part:`85%`
-   * - openmp/libomptarget/deviceRTLs/common/include
-     - `1`
-     - `1`
-     - `0`
-     - :good:`100%`
-   * - openmp/libomptarget/deviceRTLs/common/include/target
-     - `1`
-     - `1`
-     - `0`
-     - :good:`100%`
-   * - openmp/libomptarget/deviceRTLs/common/src
-     - `1`
-     - `1`
-     - `0`
-     - :good:`100%`
-   * - openmp/libomptarget/deviceRTLs/nvptx/src
-     - `2`
-     - `2`
-     - `0`
-     - :good:`100%`
+     - `9`
+     - `3`
+     - :part:`75%`
    * - openmp/libomptarget/include
+     - `9`
      - `8`
-     - `8`
-     - `0`
-     - :good:`100%`
+     - `1`
+     - :part:`88%`
    * - openmp/libomptarget/plugins/amdgpu/dynamic_hsa
      - `3`
      - `2`
@@ -7941,9 +8321,9 @@
      - :good:`100%`
    * - openmp/libomptarget/plugins/cuda/src
      - `1`
-     - `1`
      - `0`
-     - :good:`100%`
+     - `1`
+     - :none:`0%`
    * - openmp/libomptarget/plugins/generic-elf-64bit/src
      - `1`
      - `1`
@@ -7956,9 +8336,9 @@
      - :good:`100%`
    * - openmp/libomptarget/plugins/remote/lib
      - `1`
-     - `1`
      - `0`
-     - :good:`100%`
+     - `1`
+     - :none:`0%`
    * - openmp/libomptarget/plugins/remote/server
      - `3`
      - `3`
@@ -7975,10 +8355,10 @@
      - `0`
      - :good:`100%`
    * - openmp/libomptarget/src
+     - `7`
      - `6`
-     - `4`
-     - `2`
-     - :part:`66%`
+     - `1`
+     - :part:`85%`
    * - openmp/libomptarget/tools/deviceinfo
      - `1`
      - `1`
@@ -7991,9 +8371,9 @@
      - :good:`100%`
    * - openmp/runtime/src
      - `75`
-     - `66`
-     - `9`
-     - :part:`88%`
+     - `65`
+     - `10`
+     - :part:`86%`
    * - openmp/runtime/src/thirdparty/ittnotify
      - `6`
      - `5`
@@ -8151,9 +8531,9 @@
      - :good:`100%`
    * - pstl/include/pstl/internal
      - `23`
-     - `12`
-     - `11`
-     - :part:`52%`
+     - `16`
+     - `7`
+     - :part:`69%`
    * - pstl/include/pstl/internal/omp
      - `11`
      - `8`
@@ -8185,7 +8565,7 @@
      - `1`
      - :part:`50%`
    * - Total
-     - :total:`15902`
-     - :total:`8407`
-     - :total:`7495`
-     - :total:`52%`
+     - :total:`16432`
+     - :total:`8857`
+     - :total:`7575`
+     - :total:`53%`
diff --git a/clang/docs/DataFlowAnalysisIntro.md b/clang/docs/DataFlowAnalysisIntro.md
--- a/clang/docs/DataFlowAnalysisIntro.md
+++ b/clang/docs/DataFlowAnalysisIntro.md
@@ -287,7 +287,7 @@
 
 (Note that there are other ways to write this equation that produce higher
 precision analysis results. The trick is to keep exploring the execution paths
-separately and delay joining until later. Hoowever, we won't discuss those
+separately and delay joining until later. However, we won't discuss those
 variations here.)
 
 To make a conclusion about all paths through the program, we repeat this
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -2358,7 +2358,7 @@
  ``_IO_getc``, ``fdopen``, ``fopen``, ``freopen``, ``get_current_dir_name``, ``getch``, ``getchar``, ``getchar_unlocked``, ``getwd``, ``getcwd``, ``getgroups``, ``gethostname``, ``getlogin``, ``getlogin_r``, ``getnameinfo``, ``gets``, ``gets_s``, ``getseuserbyname``, ``readlink``, ``readlinkat``, ``scanf``, ``scanf_s``, ``socket``, ``wgetch``
 
 Default propagations defined by ``GenericTaintChecker``:
-``atoi``, ``atol``, ``atoll``, ``fgetc``, ``fgetln``, ``fgets``, ``fscanf``, ``sscanf``, ``getc``, ``getc_unlocked``, ``getdelim``, ``getline``, ``getw``, ``pread``, ``read``, ``strchr``, ``strrchr``, ``tolower``, ``toupper``
+``atoi``, ``atol``, ``atoll``, ``basename``, ``dirname``, ``fgetc``, ``fgetln``, ``fgets``, ``fnmatch``, ``fread``, ``fscanf``, ``fscanf_s``, ``index``, ``inflate``, ``isalnum``, ``isalpha``, ``isascii``, ``isblank``, ``iscntrl``, ``isdigit``, ``isgraph``, ``islower``, ``isprint``, ``ispunct``, ``isspace``, ``isupper``, ``isxdigit``, ``memchr``, ``memrchr``, ``sscanf``, ``getc``, ``getc_unlocked``, ``getdelim``, ``getline``, ``getw``, ``memcmp``, ``memcpy``, ``memmem``, ``memmove``, ``mbtowc``, ``pread``, ``qsort``, ``qsort_r``, ``rawmemchr``, ``read``, ``recv``, ``recvfrom``, ``rindex``, ``strcasestr``, ``strchr``, ``strchrnul``, ``strcasecmp``, ``strcmp``, ``strcspn``, ``strlen``, ``strncasecmp``, ``strncmp``, ``strndup``, ``strndupa``, ``strnlen``, ``strpbrk``, ``strrchr``, ``strsep``, ``strspn``, ``strstr``, ``strtol``, ``strtoll``, ``strtoul``, ``strtoull``, ``tolower``, ``toupper``, ``ttyname``, ``ttyname_r``, ``wctomb``, ``wcwidth``
 
 Default sinks defined in ``GenericTaintChecker``:
 ``printf``, ``setproctitle``, ``system``, ``popen``, ``execl``, ``execle``, ``execlp``, ``execv``, ``execvp``, ``execvP``, ``execve``, ``dlopen``, ``memcpy``, ``memmove``, ``strncpy``, ``strndup``, ``malloc``, ``calloc``, ``alloca``, ``memccpy``, ``realloc``, ``bcopy``
diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt
--- a/clang/docs/tools/clang-formatted-files.txt
+++ b/clang/docs/tools/clang-formatted-files.txt
@@ -1,3 +1,115 @@
+bolt/include/bolt/Core/BinaryData.h
+bolt/include/bolt/Core/BinaryEmitter.h
+bolt/include/bolt/Core/BinaryLoop.h
+bolt/include/bolt/Core/BinarySection.h
+bolt/include/bolt/Core/DebugData.h
+bolt/include/bolt/Core/Exceptions.h
+bolt/include/bolt/Core/JumpTable.h
+bolt/include/bolt/Core/MCPlus.h
+bolt/include/bolt/Core/MCPlusBuilder.h
+bolt/include/bolt/Core/ParallelUtilities.h
+bolt/include/bolt/Passes/ADRRelaxationPass.h
+bolt/include/bolt/Passes/Aligner.h
+bolt/include/bolt/Passes/AllocCombiner.h
+bolt/include/bolt/Passes/AsmDump.h
+bolt/include/bolt/Passes/BinaryFunctionCallGraph.h
+bolt/include/bolt/Passes/BinaryPasses.h
+bolt/include/bolt/Passes/CacheMetrics.h
+bolt/include/bolt/Passes/CallGraph.h
+bolt/include/bolt/Passes/CallGraphWalker.h
+bolt/include/bolt/Passes/DataflowAnalysis.h
+bolt/include/bolt/Passes/DataflowInfoManager.h
+bolt/include/bolt/Passes/DominatorAnalysis.h
+bolt/include/bolt/Passes/FrameAnalysis.h
+bolt/include/bolt/Passes/FrameOptimizer.h
+bolt/include/bolt/Passes/HFSort.h
+bolt/include/bolt/Passes/IdenticalCodeFolding.h
+bolt/include/bolt/Passes/IndirectCallPromotion.h
+bolt/include/bolt/Passes/Inliner.h
+bolt/include/bolt/Passes/Instrumentation.h
+bolt/include/bolt/Passes/InstrumentationSummary.h
+bolt/include/bolt/Passes/JTFootprintReduction.h
+bolt/include/bolt/Passes/LivenessAnalysis.h
+bolt/include/bolt/Passes/LongJmp.h
+bolt/include/bolt/Passes/LoopInversionPass.h
+bolt/include/bolt/Passes/MCF.h
+bolt/include/bolt/Passes/PatchEntries.h
+bolt/include/bolt/Passes/PLTCall.h
+bolt/include/bolt/Passes/ReachingDefOrUse.h
+bolt/include/bolt/Passes/ReachingInsns.h
+bolt/include/bolt/Passes/RegAnalysis.h
+bolt/include/bolt/Passes/RegReAssign.h
+bolt/include/bolt/Passes/ReorderAlgorithm.h
+bolt/include/bolt/Passes/ReorderData.h
+bolt/include/bolt/Passes/ReorderFunctions.h
+bolt/include/bolt/Passes/ReorderUtils.h
+bolt/include/bolt/Passes/RetpolineInsertion.h
+bolt/include/bolt/Passes/ShrinkWrapping.h
+bolt/include/bolt/Passes/SplitFunctions.h
+bolt/include/bolt/Passes/StackAllocationAnalysis.h
+bolt/include/bolt/Passes/StackAvailableExpressions.h
+bolt/include/bolt/Passes/StackPointerTracking.h
+bolt/include/bolt/Passes/StackReachingUses.h
+bolt/include/bolt/Passes/StokeInfo.h
+bolt/include/bolt/Passes/TailDuplication.h
+bolt/include/bolt/Passes/ThreeWayBranch.h
+bolt/include/bolt/Passes/ValidateInternalCalls.h
+bolt/include/bolt/Passes/VeneerElimination.h
+bolt/include/bolt/Profile/BoltAddressTranslation.h
+bolt/include/bolt/Profile/DataAggregator.h
+bolt/include/bolt/Profile/DataReader.h
+bolt/include/bolt/Profile/Heatmap.h
+bolt/include/bolt/Profile/ProfileReaderBase.h
+bolt/include/bolt/Profile/ProfileYAMLMapping.h
+bolt/include/bolt/Profile/YAMLProfileReader.h
+bolt/include/bolt/Profile/YAMLProfileWriter.h
+bolt/include/bolt/Rewrite/BinaryPassManager.h
+bolt/include/bolt/Rewrite/DWARFRewriter.h
+bolt/include/bolt/Rewrite/ExecutableFileMemoryManager.h
+bolt/include/bolt/Rewrite/MachORewriteInstance.h
+bolt/include/bolt/RuntimeLibs/HugifyRuntimeLibrary.h
+bolt/include/bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h
+bolt/include/bolt/RuntimeLibs/RuntimeLibrary.h
+bolt/include/bolt/Utils/CommandLineOpts.h
+bolt/include/bolt/Utils/NameResolver.h
+bolt/include/bolt/Utils/NameShortener.h
+bolt/include/bolt/Utils/Utils.h
+bolt/lib/Core/BinaryBasicBlock.cpp
+bolt/lib/Core/BinarySection.cpp
+bolt/lib/Core/DebugData.cpp
+bolt/lib/Core/JumpTable.cpp
+bolt/lib/Core/MCPlusBuilder.cpp
+bolt/lib/Passes/ADRRelaxationPass.cpp
+bolt/lib/Passes/AllocCombiner.cpp
+bolt/lib/Passes/AsmDump.cpp
+bolt/lib/Passes/BinaryFunctionCallGraph.cpp
+bolt/lib/Passes/CacheMetrics.cpp
+bolt/lib/Passes/CallGraphWalker.cpp
+bolt/lib/Passes/DataflowAnalysis.cpp
+bolt/lib/Passes/DataflowInfoManager.cpp
+bolt/lib/Passes/HFSort.cpp
+bolt/lib/Passes/IndirectCallPromotion.cpp
+bolt/lib/Passes/Instrumentation.cpp
+bolt/lib/Passes/JTFootprintReduction.cpp
+bolt/lib/Passes/LivenessAnalysis.cpp
+bolt/lib/Passes/LoopInversionPass.cpp
+bolt/lib/Passes/PettisAndHansen.cpp
+bolt/lib/Passes/StackAllocationAnalysis.cpp
+bolt/lib/Passes/StackPointerTracking.cpp
+bolt/lib/Passes/StackReachingUses.cpp
+bolt/lib/Passes/TailDuplication.cpp
+bolt/lib/Passes/ThreeWayBranch.cpp
+bolt/lib/Passes/ValidateInternalCalls.cpp
+bolt/lib/Profile/BoltAddressTranslation.cpp
+bolt/lib/Profile/Heatmap.cpp
+bolt/lib/Profile/ProfileReaderBase.cpp
+bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
+bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
+bolt/lib/RuntimeLibs/RuntimeLibrary.cpp
+bolt/lib/Utils/Utils.cpp
+bolt/tools/heatmap/heatmap.cpp
+bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp
+bolt/unittests/Core/MCPlusBuilder.cpp
 clang/bindings/python/tests/cindex/INPUTS/header1.h
 clang/bindings/python/tests/cindex/INPUTS/header2.h
 clang/bindings/python/tests/cindex/INPUTS/header3.h
@@ -13,10 +125,19 @@
 clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h
 clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h
 clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
+clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
 clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
 clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h
 clang/include/clang/Analysis/FlowSensitive/DataflowWorklist.h
+clang/include/clang/Analysis/FlowSensitive/MapLattice.h
+clang/include/clang/Analysis/FlowSensitive/MatchSwitch.h
+clang/include/clang/Analysis/FlowSensitive/Solver.h
+clang/include/clang/Analysis/FlowSensitive/SourceLocationsLattice.h
+clang/include/clang/Analysis/FlowSensitive/StorageLocation.h
+clang/include/clang/Analysis/FlowSensitive/Transfer.h
 clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
+clang/include/clang/Analysis/FlowSensitive/Value.h
+clang/include/clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h
 clang/include/clang/APINotes/APINotesYAMLCompiler.h
 clang/include/clang/APINotes/Types.h
 clang/include/clang/AST/AST.h
@@ -138,10 +259,12 @@
 clang/include/clang/Tooling/ASTDiff/ASTDiffInternal.h
 clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
 clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
+clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
 clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h
 clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
 clang/include/clang/Tooling/Inclusions/HeaderIncludes.h
 clang/include/clang/Tooling/Inclusions/IncludeStyle.h
+clang/include/clang/Tooling/Inclusions/StandardLibrary.h
 clang/include/clang/Tooling/Refactoring/ASTSelection.h
 clang/include/clang/Tooling/Refactoring/AtomicChange.h
 clang/include/clang/Tooling/Refactoring/Lookup.h
@@ -166,6 +289,11 @@
 clang/include/clang/Tooling/Syntax/Nodes.h
 clang/include/clang/Tooling/Syntax/Tokens.h
 clang/include/clang/Tooling/Syntax/Tree.h
+clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
+clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
+clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
+clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
+clang/include/clang/Tooling/Syntax/Pseudo/Token.h
 clang/include/clang/Tooling/Transformer/MatchConsumer.h
 clang/include/clang/Tooling/Transformer/Parsing.h
 clang/include/clang/Tooling/Transformer/RangeSelector.h
@@ -178,9 +306,13 @@
 clang/lib/Analysis/CalledOnceCheck.cpp
 clang/lib/Analysis/CloneDetection.cpp
 clang/lib/Analysis/CodeInjector.cpp
-clang/lib/Analysis/ExprMutationAnalyzer.cpp
 clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp
+clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
+clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+clang/lib/Analysis/FlowSensitive/SourceLocationsLattice.cpp
+clang/lib/Analysis/FlowSensitive/Transfer.cpp
 clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
+clang/lib/Analysis/FlowSensitive/WatchedLiteralsSolver.cpp
 clang/lib/Analysis/plugins/CheckerDependencyHandling/CheckerDependencyHandling.cpp
 clang/lib/Analysis/plugins/SampleAnalyzer/MainCallChecker.cpp
 clang/lib/APINotes/APINotesFormat.h
@@ -246,7 +378,6 @@
 clang/lib/Basic/Targets/WebAssembly.cpp
 clang/lib/Basic/Targets/WebAssembly.h
 clang/lib/Basic/Targets/XCore.cpp
-clang/lib/CodeGen/CGCall.h
 clang/lib/CodeGen/CGCUDARuntime.cpp
 clang/lib/CodeGen/CGLoopInfo.cpp
 clang/lib/CodeGen/CGLoopInfo.h
@@ -262,6 +393,7 @@
 clang/lib/Driver/XRayArgs.cpp
 clang/lib/Driver/ToolChains/AIX.cpp
 clang/lib/Driver/ToolChains/AIX.h
+clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
 clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
 clang/lib/Driver/ToolChains/Ananas.h
 clang/lib/Driver/ToolChains/AVR.cpp
@@ -313,6 +445,8 @@
 clang/lib/Format/BreakableToken.h
 clang/lib/Format/ContinuationIndenter.cpp
 clang/lib/Format/ContinuationIndenter.h
+clang/lib/Format/DefinitionBlockSeparator.cpp
+clang/lib/Format/DefinitionBlockSeparator.h
 clang/lib/Format/Encoding.h
 clang/lib/Format/Format.cpp
 clang/lib/Format/FormatInternal.h
@@ -340,6 +474,7 @@
 clang/lib/Format/UsingDeclarationsSorter.h
 clang/lib/Format/WhitespaceManager.cpp
 clang/lib/Format/WhitespaceManager.h
+clang/lib/Frontend/ExtractAPIConsumer.cpp
 clang/lib/Frontend/FrontendOptions.cpp
 clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp
 clang/lib/Frontend/SerializedDiagnosticReader.cpp
@@ -352,18 +487,15 @@
 clang/lib/Headers/nmmintrin.h
 clang/lib/Headers/s390intrin.h
 clang/lib/Headers/stdalign.h
-clang/lib/Headers/stdnoreturn.h
 clang/lib/Headers/wmmintrin.h
 clang/lib/Headers/xtestintrin.h
 clang/lib/Headers/__clang_cuda_texture_intrinsics.h
-clang/lib/Headers/__clang_hip_cmath.h
 clang/lib/Headers/__clang_hip_libdevice_declares.h
 clang/lib/Headers/__stddef_max_align_t.h
 clang/lib/Headers/openmp_wrappers/complex.h
 clang/lib/Headers/openmp_wrappers/complex_cmath.h
 clang/lib/Headers/openmp_wrappers/math.h
 clang/lib/Headers/openmp_wrappers/time.h
-clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
 clang/lib/Headers/ppc_wrappers/mmintrin.h
 clang/lib/Headers/ppc_wrappers/smmintrin.h
 clang/lib/Index/FileIndexRecord.cpp
@@ -378,13 +510,15 @@
 clang/lib/Parse/ParseOpenMP.cpp
 clang/lib/Sema/CodeCompleteConsumer.cpp
 clang/lib/Sema/CoroutineStmtBuilder.h
-clang/lib/Sema/SemaOpenMP.cpp
 clang/lib/Sema/SemaSYCL.cpp
 clang/lib/Sema/UsedDeclVisitor.h
 clang/lib/Serialization/InMemoryModuleCache.cpp
 clang/lib/Serialization/ModuleFileExtension.cpp
 clang/lib/StaticAnalyzer/Checkers/AllocationState.h
 clang/lib/StaticAnalyzer/Checkers/CheckPlacementNew.cpp
+clang/lib/StaticAnalyzer/Checkers/ErrnoModeling.cpp
+clang/lib/StaticAnalyzer/Checkers/ErrnoModeling.h
+clang/lib/StaticAnalyzer/Checkers/ErrnoTesterChecker.cpp
 clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp
 clang/lib/StaticAnalyzer/Checkers/FuchsiaHandleChecker.cpp
 clang/lib/StaticAnalyzer/Checkers/InterCheckerAPI.h
@@ -433,12 +567,14 @@
 clang/lib/Tooling/StandaloneExecution.cpp
 clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
 clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp
+clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
 clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
 clang/lib/Tooling/DumpTool/APIData.h
 clang/lib/Tooling/DumpTool/ASTSrcLocProcessor.h
 clang/lib/Tooling/DumpTool/ClangSrcLocDump.cpp
 clang/lib/Tooling/Inclusions/HeaderIncludes.cpp
 clang/lib/Tooling/Inclusions/IncludeStyle.cpp
+clang/lib/Tooling/Inclusions/StandardLibrary.cpp
 clang/lib/Tooling/Refactoring/ASTSelection.cpp
 clang/lib/Tooling/Refactoring/Lookup.cpp
 clang/lib/Tooling/Refactoring/RefactoringActions.cpp
@@ -451,6 +587,14 @@
 clang/lib/Tooling/Syntax/Nodes.cpp
 clang/lib/Tooling/Syntax/Synthesis.cpp
 clang/lib/Tooling/Syntax/Tree.cpp
+clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
+clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
+clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
+clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
+clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
+clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp
+clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
+clang/lib/Tooling/Syntax/Pseudo/Token.cpp
 clang/lib/Tooling/Transformer/Parsing.cpp
 clang/lib/Tooling/Transformer/SourceCodeBuilders.cpp
 clang/lib/Tooling/Transformer/Stencil.cpp
@@ -463,8 +607,11 @@
 clang/tools/clang-fuzzer/ExampleClangLLVMProtoFuzzer.cpp
 clang/tools/clang-fuzzer/ExampleClangLoopProtoFuzzer.cpp
 clang/tools/clang-fuzzer/handle-llvm/handle_llvm.h
+clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
 clang/tools/clang-nvlink-wrapper/ClangNvlinkWrapper.cpp
 clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
+clang/tools/clang-pseudo/ClangPseudo.cpp
 clang/tools/clang-refactor/ClangRefactor.cpp
 clang/tools/clang-refactor/TestSupport.cpp
 clang/tools/clang-refactor/TestSupport.h
@@ -482,7 +629,15 @@
 clang/tools/scan-build-py/tests/functional/src/include/clean-one.h
 clang/unittests/Analysis/CFGBuildResult.h
 clang/unittests/Analysis/MacroExpansionContextTest.cpp
+clang/unittests/Analysis/FlowSensitive/DataflowAnalysisContextTest.cpp
+clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
+clang/unittests/Analysis/FlowSensitive/MapLatticeTest.cpp
+clang/unittests/Analysis/FlowSensitive/MatchSwitchTest.cpp
+clang/unittests/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp
+clang/unittests/Analysis/FlowSensitive/NoopAnalysis.h
 clang/unittests/Analysis/FlowSensitive/SingleVarConstantPropagationTest.cpp
+clang/unittests/Analysis/FlowSensitive/SolverTest.cpp
+clang/unittests/Analysis/FlowSensitive/SourceLocationsLatticeTest.cpp
 clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp
 clang/unittests/Analysis/FlowSensitive/TestingSupport.h
 clang/unittests/Analysis/FlowSensitive/TestingSupportTest.cpp
@@ -506,6 +661,7 @@
 clang/unittests/CrossTU/CrossTranslationUnitTest.cpp
 clang/unittests/Driver/SanitizerArgsTest.cpp
 clang/unittests/Format/CleanupTest.cpp
+clang/unittests/Format/DefinitionBlockSeparatorTest.cpp
 clang/unittests/Format/FormatTest.cpp
 clang/unittests/Format/FormatTestComments.cpp
 clang/unittests/Format/FormatTestCSharp.cpp
@@ -541,6 +697,7 @@
 clang/unittests/Lex/HeaderMapTest.cpp
 clang/unittests/Lex/HeaderMapTestUtils.h
 clang/unittests/Lex/HeaderSearchTest.cpp
+clang/unittests/Lex/PPMemoryAllocationsTest.cpp
 clang/unittests/libclang/CrashTests/LibclangCrashTest.cpp
 clang/unittests/Rewrite/RewriterTest.cpp
 clang/unittests/Sema/CodeCompleteTest.cpp
@@ -554,6 +711,7 @@
 clang/unittests/StaticAnalyzer/StoreTest.cpp
 clang/unittests/StaticAnalyzer/SValTest.cpp
 clang/unittests/StaticAnalyzer/SymbolReaperTest.cpp
+clang/unittests/Tooling/CastExprTest.cpp
 clang/unittests/Tooling/DependencyScannerTest.cpp
 clang/unittests/Tooling/ExecutionTest.cpp
 clang/unittests/Tooling/LookupTest.cpp
@@ -561,6 +719,7 @@
 clang/unittests/Tooling/RefactoringActionRulesTest.cpp
 clang/unittests/Tooling/ReplacementTest.h
 clang/unittests/Tooling/SourceCodeBuildersTest.cpp
+clang/unittests/Tooling/StandardLibraryTest.cpp
 clang/unittests/Tooling/StencilTest.cpp
 clang/unittests/Tooling/RecursiveASTVisitorTests/CallbacksCallExpr.cpp
 clang/unittests/Tooling/RecursiveASTVisitorTests/CallbacksLeaf.cpp
@@ -577,6 +736,10 @@
 clang/unittests/Tooling/Syntax/TokensTest.cpp
 clang/unittests/Tooling/Syntax/TreeTestBase.cpp
 clang/unittests/Tooling/Syntax/TreeTestBase.h
+clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
+clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp
+clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp
+clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp
 clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
 clang/utils/TableGen/ClangSyntaxEmitter.cpp
 clang/utils/TableGen/TableGenBackends.h
@@ -641,6 +804,8 @@
 clang-tools-extra/clang-tidy/ClangTidyProfiling.h
 clang-tools-extra/clang-tidy/GlobList.cpp
 clang-tools-extra/clang-tidy/GlobList.h
+clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp
+clang-tools-extra/clang-tidy/NoLintDirectiveHandler.h
 clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h
 clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.cpp
 clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.h
@@ -709,6 +874,7 @@
 clang-tools-extra/clang-tidy/boost/UseToStringCheck.h
 clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
 clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.h
+clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp
 clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.h
 clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp
 clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.h
@@ -760,6 +926,8 @@
 clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.h
 clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.cpp
 clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.h
+clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.cpp
+clang-tools-extra/clang-tidy/bugprone/SharedPtrArrayMismatchCheck.h
 clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp
 clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h
 clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp
@@ -767,6 +935,8 @@
 clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.cpp
 clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.h
 clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h
+clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp
+clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.h
 clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp
 clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp
 clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h
@@ -849,6 +1019,7 @@
 clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h
 clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.cpp
 clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h
+clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp
 clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.h
 clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp
 clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.h
@@ -944,6 +1115,8 @@
 clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp
 clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h
 clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp
+clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp
+clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h
 clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp
 clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h
 clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp
@@ -1013,7 +1186,6 @@
 clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h
 clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp
 clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.h
-clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
 clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h
 clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp
 clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp
@@ -1072,6 +1244,8 @@
 clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp
 clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.h
 clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp
+clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp
+clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h
 clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp
 clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.h
 clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.h
@@ -1079,6 +1253,8 @@
 clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h
 clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.cpp
 clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h
+clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp
+clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h
 clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.h
 clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp
 clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.h
@@ -1122,7 +1298,9 @@
 clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.h
 clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.cpp
 clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.h
+clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
 clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h
+clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprMatchers.h
 clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp
 clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.cpp
 clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h
@@ -1230,7 +1408,6 @@
 clang-tools-extra/clangd/IncludeCleaner.cpp
 clang-tools-extra/clangd/IncludeCleaner.h
 clang-tools-extra/clangd/IncludeFixer.cpp
-clang-tools-extra/clangd/InlayHints.cpp
 clang-tools-extra/clangd/InlayHints.h
 clang-tools-extra/clangd/LSPBinder.h
 clang-tools-extra/clangd/ParsedAST.cpp
@@ -1243,7 +1420,6 @@
 clang-tools-extra/clangd/Quality.cpp
 clang-tools-extra/clangd/RIFF.cpp
 clang-tools-extra/clangd/RIFF.h
-clang-tools-extra/clangd/Selection.cpp
 clang-tools-extra/clangd/Selection.h
 clang-tools-extra/clangd/SemanticHighlighting.h
 clang-tools-extra/clangd/SemanticSelection.cpp
@@ -1290,14 +1466,12 @@
 clang-tools-extra/clangd/index/Symbol.cpp
 clang-tools-extra/clangd/index/Symbol.h
 clang-tools-extra/clangd/index/SymbolCollector.cpp
-clang-tools-extra/clangd/index/SymbolCollector.h
 clang-tools-extra/clangd/index/SymbolID.cpp
 clang-tools-extra/clangd/index/SymbolLocation.cpp
 clang-tools-extra/clangd/index/SymbolLocation.h
 clang-tools-extra/clangd/index/SymbolOrigin.cpp
 clang-tools-extra/clangd/index/SymbolOrigin.h
 clang-tools-extra/clangd/index/YAMLSerialization.cpp
-clang-tools-extra/clangd/index/dex/Dex.h
 clang-tools-extra/clangd/index/dex/Iterator.cpp
 clang-tools-extra/clangd/index/dex/Iterator.h
 clang-tools-extra/clangd/index/dex/PostingList.cpp
@@ -1314,6 +1488,8 @@
 clang-tools-extra/clangd/index/remote/server/Server.cpp
 clang-tools-extra/clangd/index/remote/unimplemented/UnimplementedClient.cpp
 clang-tools-extra/clangd/indexer/IndexerMain.cpp
+clang-tools-extra/clangd/refactor/InsertionPoint.cpp
+clang-tools-extra/clangd/refactor/InsertionPoint.h
 clang-tools-extra/clangd/refactor/Rename.h
 clang-tools-extra/clangd/refactor/Tweak.cpp
 clang-tools-extra/clangd/refactor/Tweak.h
@@ -1362,6 +1538,7 @@
 clang-tools-extra/clangd/unittests/CanonicalIncludesTests.cpp
 clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp
 clang-tools-extra/clangd/unittests/ClangdTests.cpp
+clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
 clang-tools-extra/clangd/unittests/CodeCompletionStringsTests.cpp
 clang-tools-extra/clangd/unittests/CollectMacrosTests.cpp
 clang-tools-extra/clangd/unittests/CompilerTests.cpp
@@ -1378,6 +1555,7 @@
 clang-tools-extra/clangd/unittests/FeatureModulesTests.cpp
 clang-tools-extra/clangd/unittests/FileDistanceTests.cpp
 clang-tools-extra/clangd/unittests/FileIndexTests.cpp
+clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
 clang-tools-extra/clangd/unittests/FindTargetTests.cpp
 clang-tools-extra/clangd/unittests/FSTests.cpp
 clang-tools-extra/clangd/unittests/FuzzyMatchTests.cpp
@@ -1387,10 +1565,10 @@
 clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
 clang-tools-extra/clangd/unittests/IndexActionTests.cpp
 clang-tools-extra/clangd/unittests/InlayHintTests.cpp
+clang-tools-extra/clangd/unittests/InsertionPointTests.cpp
 clang-tools-extra/clangd/unittests/LoggerTests.cpp
 clang-tools-extra/clangd/unittests/LSPBinderTests.cpp
 clang-tools-extra/clangd/unittests/LSPClient.cpp
-clang-tools-extra/clangd/unittests/LSPClient.h
 clang-tools-extra/clangd/unittests/ModulesTests.cpp
 clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
 clang-tools-extra/clangd/unittests/PreambleTests.cpp
@@ -1417,7 +1595,6 @@
 clang-tools-extra/clangd/unittests/TidyProviderTests.cpp
 clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp
 clang-tools-extra/clangd/unittests/URITests.cpp
-clang-tools-extra/clangd/unittests/XRefsTests.cpp
 clang-tools-extra/clangd/unittests/decision_forest_model/CategoricalFeature.h
 clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp
 clang-tools-extra/clangd/unittests/support/CancellationTests.cpp
@@ -1433,7 +1610,6 @@
 clang-tools-extra/clangd/unittests/support/TraceTests.cpp
 clang-tools-extra/clangd/unittests/tweaks/AddUsingTests.cpp
 clang-tools-extra/clangd/unittests/tweaks/AnnotateHighlightingsTests.cpp
-clang-tools-extra/clangd/unittests/tweaks/DefineInlineTests.cpp
 clang-tools-extra/clangd/unittests/tweaks/DefineOutlineTests.cpp
 clang-tools-extra/clangd/unittests/tweaks/DumpASTTests.cpp
 clang-tools-extra/clangd/unittests/tweaks/DumpRecordLayoutTests.cpp
@@ -1477,6 +1653,7 @@
 clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp
 clang-tools-extra/unittests/clang-tidy/OptionsProviderTest.cpp
 clang-tools-extra/unittests/clang-tidy/OverlappingReplacementsTest.cpp
+clang-tools-extra/unittests/clang-tidy/ReadabilityModuleTest.cpp
 clang-tools-extra/unittests/clang-tidy/TransformerClangTidyCheckTest.cpp
 compiler-rt/include/sanitizer/linux_syscall_hooks.h
 compiler-rt/include/sanitizer/memprof_interface.h
@@ -1487,6 +1664,7 @@
 compiler-rt/lib/asan/asan_lock.h
 compiler-rt/lib/asan/asan_mapping.h
 compiler-rt/lib/asan/asan_mapping_sparc64.h
+compiler-rt/lib/asan/asan_rtl_static.cpp
 compiler-rt/lib/asan/tests/asan_globals_test.cpp
 compiler-rt/lib/builtins/fp_extend.h
 compiler-rt/lib/builtins/fp_lib.h
@@ -1504,7 +1682,6 @@
 compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h
 compiler-rt/lib/dfsan/dfsan_flags.h
 compiler-rt/lib/dfsan/dfsan_interceptors.cpp
-compiler-rt/lib/dfsan/dfsan_new_delete.cpp
 compiler-rt/lib/dfsan/dfsan_origin.h
 compiler-rt/lib/dfsan/dfsan_platform.h
 compiler-rt/lib/dfsan/dfsan_thread.h
@@ -1576,6 +1753,7 @@
 compiler-rt/lib/hwasan/hwasan_linux.cpp
 compiler-rt/lib/hwasan/hwasan_poisoning.cpp
 compiler-rt/lib/hwasan/hwasan_poisoning.h
+compiler-rt/lib/hwasan/hwasan_preinit.cpp
 compiler-rt/lib/interception/interception_mac.cpp
 compiler-rt/lib/interception/tests/interception_test_main.cpp
 compiler-rt/lib/lsan/lsan.h
@@ -1597,10 +1775,8 @@
 compiler-rt/lib/memprof/memprof_internal.h
 compiler-rt/lib/memprof/memprof_linux.cpp
 compiler-rt/lib/memprof/memprof_malloc_linux.cpp
-compiler-rt/lib/memprof/memprof_meminfoblock.h
 compiler-rt/lib/memprof/memprof_mibmap.cpp
 compiler-rt/lib/memprof/memprof_mibmap.h
-compiler-rt/lib/memprof/memprof_new_delete.cpp
 compiler-rt/lib/memprof/memprof_posix.cpp
 compiler-rt/lib/memprof/memprof_preinit.cpp
 compiler-rt/lib/memprof/memprof_rawprofile.cpp
@@ -1620,6 +1796,7 @@
 compiler-rt/lib/msan/msan_poisoning.h
 compiler-rt/lib/msan/msan_report.h
 compiler-rt/lib/orc/adt.h
+compiler-rt/lib/orc/debug.h
 compiler-rt/lib/orc/elfnix_platform.cpp
 compiler-rt/lib/orc/elfnix_platform.h
 compiler-rt/lib/orc/endianness.h
@@ -1628,6 +1805,7 @@
 compiler-rt/lib/orc/extensible_rtti.cpp
 compiler-rt/lib/orc/extensible_rtti.h
 compiler-rt/lib/orc/log_error_to_stderr.cpp
+compiler-rt/lib/orc/macho_ehframe_registration.cpp
 compiler-rt/lib/orc/macho_platform.cpp
 compiler-rt/lib/orc/macho_platform.h
 compiler-rt/lib/orc/run_program_wrapper.cpp
@@ -1653,6 +1831,7 @@
 compiler-rt/lib/sanitizer_common/sanitizer_errno.h
 compiler-rt/lib/sanitizer_common/sanitizer_errno_codes.h
 compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h
+compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp
 compiler-rt/lib/sanitizer_common/sanitizer_leb128.h
 compiler-rt/lib/sanitizer_common/sanitizer_local_address_space_view.h
 compiler-rt/lib/sanitizer_common/sanitizer_lzw.h
@@ -1731,7 +1910,6 @@
 compiler-rt/lib/scudo/standalone/vector.h
 compiler-rt/lib/scudo/standalone/wrappers_c.cpp
 compiler-rt/lib/scudo/standalone/wrappers_c.h
-compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp
 compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
 compiler-rt/lib/scudo/standalone/wrappers_c_checks.h
 compiler-rt/lib/scudo/standalone/benchmarks/malloc_benchmark.cpp
@@ -1758,7 +1936,6 @@
 compiler-rt/lib/scudo/standalone/tests/size_class_map_test.cpp
 compiler-rt/lib/scudo/standalone/tests/stats_test.cpp
 compiler-rt/lib/scudo/standalone/tests/strings_test.cpp
-compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp
 compiler-rt/lib/scudo/standalone/tests/vector_test.cpp
 compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
 compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
@@ -1837,6 +2014,7 @@
 compiler-rt/lib/xray/tests/unit/xray_unit_test_main.cpp
 compiler-rt/tools/gwp_asan/options_parser_fuzzer.cpp
 compiler-rt/tools/gwp_asan/stack_trace_compressor_fuzzer.cpp
+cross-project-tests/debuginfo-tests/clang_llvm_roundtrip/simplified_template_names_noncanonical_type_units.cpp
 cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/dex_and_source/test.cpp
 cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/precompiled_binary/test.cpp
 cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/precompiled_binary_different_dir/source/test.cpp
@@ -1847,15 +2025,16 @@
 cross-project-tests/debuginfo-tests/dexter-tests/realigned-frame.cpp
 cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/llvm-support.cpp
 flang/examples/external-hello.cpp
-flang/examples/flang-omp-report-plugin/flang-omp-report-visitor.cpp
-flang/examples/flang-omp-report-plugin/flang-omp-report-visitor.h
-flang/examples/flang-omp-report-plugin/flang-omp-report.cpp
+flang/examples/FlangOmpReport/FlangOmpReport.cpp
+flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
+flang/examples/FlangOmpReport/FlangOmpReportVisitor.h
 flang/examples/PrintFlangFunctionNames/PrintFlangFunctionNames.cpp
 flang/include/flang/ISO_Fortran_binding.h
 flang/include/flang/Common/bit-population-count.h
 flang/include/flang/Common/constexpr-bitset.h
 flang/include/flang/Common/default-kinds.h
 flang/include/flang/Common/enum-set.h
+flang/include/flang/Common/fast-int-set.h
 flang/include/flang/Common/format.h
 flang/include/flang/Common/Fortran-features.h
 flang/include/flang/Common/Fortran.h
@@ -1903,31 +2082,37 @@
 flang/include/flang/Frontend/FrontendOptions.h
 flang/include/flang/Frontend/FrontendPluginRegistry.h
 flang/include/flang/Frontend/PreprocessorOptions.h
+flang/include/flang/Frontend/TargetOptions.h
 flang/include/flang/Frontend/TextDiagnostic.h
 flang/include/flang/Frontend/TextDiagnosticBuffer.h
 flang/include/flang/Frontend/TextDiagnosticPrinter.h
 flang/include/flang/FrontendTool/Utils.h
 flang/include/flang/Lower/AbstractConverter.h
+flang/include/flang/Lower/Allocatable.h
+flang/include/flang/Lower/BoxAnalyzer.h
 flang/include/flang/Lower/Bridge.h
-flang/include/flang/Lower/CharacterExpr.h
-flang/include/flang/Lower/CharacterRuntime.h
+flang/include/flang/Lower/CallInterface.h
 flang/include/flang/Lower/Coarray.h
-flang/include/flang/Lower/ComplexExpr.h
+flang/include/flang/Lower/ComponentPath.h
+flang/include/flang/Lower/ConvertExpr.h
 flang/include/flang/Lower/ConvertType.h
-flang/include/flang/Lower/DoLoopHelper.h
-flang/include/flang/Lower/FIRBuilder.h
+flang/include/flang/Lower/ConvertVariable.h
+flang/include/flang/Lower/DumpEvaluateExpr.h
+flang/include/flang/Lower/HostAssociations.h
+flang/include/flang/Lower/IntervalSet.h
 flang/include/flang/Lower/IntrinsicCall.h
 flang/include/flang/Lower/IO.h
+flang/include/flang/Lower/IterationSpace.h
 flang/include/flang/Lower/Mangler.h
 flang/include/flang/Lower/OpenACC.h
 flang/include/flang/Lower/OpenMP.h
 flang/include/flang/Lower/PFTBuilder.h
 flang/include/flang/Lower/PFTDefs.h
 flang/include/flang/Lower/Runtime.h
+flang/include/flang/Lower/StatementContext.h
 flang/include/flang/Lower/Todo.h
-flang/include/flang/Lower/Utils.h
-flang/include/flang/Lower/Support/BoxValue.h
 flang/include/flang/Lower/Support/Utils.h
+flang/include/flang/Lower/Support/Verifier.h
 flang/include/flang/Optimizer/Builder/BoxValue.h
 flang/include/flang/Optimizer/Builder/Character.h
 flang/include/flang/Optimizer/Builder/Complex.h
@@ -1937,11 +2122,13 @@
 flang/include/flang/Optimizer/Builder/MutableBox.h
 flang/include/flang/Optimizer/Builder/Runtime/Assign.h
 flang/include/flang/Optimizer/Builder/Runtime/Character.h
+flang/include/flang/Optimizer/Builder/Runtime/Command.h
 flang/include/flang/Optimizer/Builder/Runtime/Derived.h
 flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
 flang/include/flang/Optimizer/Builder/Runtime/Ragged.h
 flang/include/flang/Optimizer/Builder/Runtime/Reduction.h
 flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+flang/include/flang/Optimizer/Builder/Runtime/Stop.h
 flang/include/flang/Optimizer/Builder/Runtime/Transformational.h
 flang/include/flang/Optimizer/CodeGen/CodeGen.h
 flang/include/flang/Optimizer/Dialect/FIRAttr.h
@@ -1984,6 +2171,7 @@
 flang/include/flang/Runtime/descriptor.h
 flang/include/flang/Runtime/entry-names.h
 flang/include/flang/Runtime/extensions.h
+flang/include/flang/Runtime/inquiry.h
 flang/include/flang/Runtime/io-api.h
 flang/include/flang/Runtime/iostat.h
 flang/include/flang/Runtime/main.h
@@ -2053,22 +2241,26 @@
 flang/lib/Frontend/TextDiagnosticBuffer.cpp
 flang/lib/Frontend/TextDiagnosticPrinter.cpp
 flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
-flang/lib/Lower/CharacterExpr.cpp
-flang/lib/Lower/CharacterRuntime.cpp
+flang/lib/Lower/Allocatable.cpp
+flang/lib/Lower/Bridge.cpp
+flang/lib/Lower/CallInterface.cpp
 flang/lib/Lower/Coarray.cpp
-flang/lib/Lower/ComplexExpr.cpp
+flang/lib/Lower/ComponentPath.cpp
 flang/lib/Lower/ConvertExpr.cpp
 flang/lib/Lower/ConvertType.cpp
-flang/lib/Lower/DoLoopHelper.cpp
-flang/lib/Lower/FIRBuilder.cpp
+flang/lib/Lower/ConvertVariable.cpp
+flang/lib/Lower/DumpEvaluateExpr.cpp
 flang/lib/Lower/IntervalSet.h
 flang/lib/Lower/IntrinsicCall.cpp
 flang/lib/Lower/IO.cpp
+flang/lib/Lower/IterationSpace.cpp
 flang/lib/Lower/Mangler.cpp
 flang/lib/Lower/OpenACC.cpp
 flang/lib/Lower/OpenMP.cpp
 flang/lib/Lower/PFTBuilder.cpp
 flang/lib/Lower/RTBuilder.h
+flang/lib/Lower/Runtime.cpp
+flang/lib/Lower/SymbolMap.cpp
 flang/lib/Optimizer/Builder/BoxValue.cpp
 flang/lib/Optimizer/Builder/Character.cpp
 flang/lib/Optimizer/Builder/Complex.cpp
@@ -2077,13 +2269,16 @@
 flang/lib/Optimizer/Builder/MutableBox.cpp
 flang/lib/Optimizer/Builder/Runtime/Assign.cpp
 flang/lib/Optimizer/Builder/Runtime/Character.cpp
+flang/lib/Optimizer/Builder/Runtime/Command.cpp
 flang/lib/Optimizer/Builder/Runtime/Derived.cpp
 flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
 flang/lib/Optimizer/Builder/Runtime/Ragged.cpp
 flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
+flang/lib/Optimizer/Builder/Runtime/Stop.cpp
 flang/lib/Optimizer/Builder/Runtime/Transformational.cpp
 flang/lib/Optimizer/CodeGen/CGOps.cpp
 flang/lib/Optimizer/CodeGen/CGOps.h
+flang/lib/Optimizer/CodeGen/CodeGen.cpp
 flang/lib/Optimizer/CodeGen/DescriptorModel.h
 flang/lib/Optimizer/CodeGen/PassDetail.h
 flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
@@ -2091,10 +2286,13 @@
 flang/lib/Optimizer/CodeGen/Target.h
 flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
 flang/lib/Optimizer/CodeGen/TypeConverter.h
+flang/lib/Optimizer/Dialect/FIRAttr.cpp
 flang/lib/Optimizer/Dialect/FIRDialect.cpp
 flang/lib/Optimizer/Dialect/FIROps.cpp
 flang/lib/Optimizer/Dialect/FIRType.cpp
+flang/lib/Optimizer/Dialect/Inliner.cpp
 flang/lib/Optimizer/Support/FIRContext.cpp
+flang/lib/Optimizer/Support/InitFIR.cpp
 flang/lib/Optimizer/Support/InternalNames.cpp
 flang/lib/Optimizer/Support/KindMapping.cpp
 flang/lib/Optimizer/Transforms/AbstractResult.cpp
@@ -2103,7 +2301,6 @@
 flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
 flang/lib/Optimizer/Transforms/CharacterConversion.cpp
 flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp
-flang/lib/Optimizer/Transforms/Inliner.cpp
 flang/lib/Optimizer/Transforms/MemoryAllocation.cpp
 flang/lib/Optimizer/Transforms/MemRefDataFlowOpt.cpp
 flang/lib/Optimizer/Transforms/PassDetail.h
@@ -2157,7 +2354,6 @@
 flang/lib/Semantics/check-allocate.h
 flang/lib/Semantics/check-arithmeticif.cpp
 flang/lib/Semantics/check-arithmeticif.h
-flang/lib/Semantics/check-call.cpp
 flang/lib/Semantics/check-call.h
 flang/lib/Semantics/check-case.cpp
 flang/lib/Semantics/check-case.h
@@ -2167,7 +2363,6 @@
 flang/lib/Semantics/check-data.h
 flang/lib/Semantics/check-deallocate.cpp
 flang/lib/Semantics/check-deallocate.h
-flang/lib/Semantics/check-declarations.cpp
 flang/lib/Semantics/check-declarations.h
 flang/lib/Semantics/check-directive-structure.h
 flang/lib/Semantics/check-do-forall.cpp
@@ -2195,7 +2390,6 @@
 flang/lib/Semantics/compute-offsets.cpp
 flang/lib/Semantics/compute-offsets.h
 flang/lib/Semantics/data-to-inits.cpp
-flang/lib/Semantics/data-to-inits.h
 flang/lib/Semantics/mod-file.h
 flang/lib/Semantics/pointer-assignment.cpp
 flang/lib/Semantics/pointer-assignment.h
@@ -2213,7 +2407,6 @@
 flang/lib/Semantics/runtime-type-info.cpp
 flang/lib/Semantics/scope.cpp
 flang/lib/Semantics/semantics.cpp
-flang/lib/Semantics/symbol.cpp
 flang/lib/Semantics/tools.cpp
 flang/lib/Semantics/unparse-with-symbols.cpp
 flang/module/omp_lib.h
@@ -2248,6 +2441,7 @@
 flang/runtime/format-implementation.h
 flang/runtime/format.cpp
 flang/runtime/format.h
+flang/runtime/inquiry.cpp
 flang/runtime/internal-unit.cpp
 flang/runtime/internal-unit.h
 flang/runtime/io-api.cpp
@@ -2287,8 +2481,8 @@
 flang/runtime/type-info.h
 flang/runtime/unit-map.cpp
 flang/runtime/unit-map.h
-flang/runtime/unit.cpp
 flang/runtime/unit.h
+flang/tools/bbc/bbc.cpp
 flang/tools/f18/dump.cpp
 flang/tools/f18-parse-demo/f18-parse-demo.cpp
 flang/tools/f18-parse-demo/stub-evaluate.cpp
@@ -2296,6 +2490,7 @@
 flang/tools/flang-driver/driver.cpp
 flang/tools/flang-driver/fc1_main.cpp
 flang/tools/tco/tco.cpp
+flang/unittests/Common/FastIntSetTest.cpp
 flang/unittests/Decimal/quick-sanity-test.cpp
 flang/unittests/Decimal/thorough-test.cpp
 flang/unittests/Evaluate/bit-population-count.cpp
@@ -2324,11 +2519,13 @@
 flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
 flang/unittests/Optimizer/Builder/Runtime/AssignTest.cpp
 flang/unittests/Optimizer/Builder/Runtime/CharacterTest.cpp
+flang/unittests/Optimizer/Builder/Runtime/CommandTest.cpp
 flang/unittests/Optimizer/Builder/Runtime/DerivedTest.cpp
 flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp
 flang/unittests/Optimizer/Builder/Runtime/RaggedTest.cpp
 flang/unittests/Optimizer/Builder/Runtime/ReductionTest.cpp
 flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h
+flang/unittests/Optimizer/Builder/Runtime/StopTest.cpp
 flang/unittests/Optimizer/Builder/Runtime/TransformationalTest.cpp
 flang/unittests/Runtime/BufferTest.cpp
 flang/unittests/Runtime/CharacterTest.cpp
@@ -2337,6 +2534,7 @@
 flang/unittests/Runtime/CrashHandlerFixture.h
 flang/unittests/Runtime/ExternalIOTest.cpp
 flang/unittests/Runtime/Format.cpp
+flang/unittests/Runtime/Inquiry.cpp
 flang/unittests/Runtime/ListInputTest.cpp
 flang/unittests/Runtime/Matmul.cpp
 flang/unittests/Runtime/MiscIntrinsic.cpp
@@ -2390,8 +2588,38 @@
 libc/fuzzing/string/strcmp_fuzz.cpp
 libc/fuzzing/string/strstr_fuzz.cpp
 libc/include/__llvm-libc-common.h
-libc/include/__llvm-libc-stdc-types.h
-libc/include/__posix-types.h
+libc/include/llvm-libc-macros/fcntl-macros.h
+libc/include/llvm-libc-macros/stdio-macros.h
+libc/include/llvm-libc-macros/linux/fcntl-macros.h
+libc/include/llvm-libc-types/cnd_t.h
+libc/include/llvm-libc-types/div_t.h
+libc/include/llvm-libc-types/double_t.h
+libc/include/llvm-libc-types/fenv_t.h
+libc/include/llvm-libc-types/fexcept_t.h
+libc/include/llvm-libc-types/FILE.h
+libc/include/llvm-libc-types/float_t.h
+libc/include/llvm-libc-types/imaxdiv_t.h
+libc/include/llvm-libc-types/ldiv_t.h
+libc/include/llvm-libc-types/lldiv_t.h
+libc/include/llvm-libc-types/mode_t.h
+libc/include/llvm-libc-types/mtx_t.h
+libc/include/llvm-libc-types/off_t.h
+libc/include/llvm-libc-types/once_flag.h
+libc/include/llvm-libc-types/size_t.h
+libc/include/llvm-libc-types/ssize_t.h
+libc/include/llvm-libc-types/struct_sigaction.h
+libc/include/llvm-libc-types/struct_tm.h
+libc/include/llvm-libc-types/thrd_start_t.h
+libc/include/llvm-libc-types/thrd_t.h
+libc/include/llvm-libc-types/time_t.h
+libc/include/llvm-libc-types/__atexithandler_t.h
+libc/include/llvm-libc-types/__bsearchcompare_t.h
+libc/include/llvm-libc-types/__call_once_func_t.h
+libc/include/llvm-libc-types/__futex_word.h
+libc/include/llvm-libc-types/__mutex_type.h
+libc/include/llvm-libc-types/__qsortcompare_t.h
+libc/include/llvm-libc-types/__sighandler_t.h
+libc/loader/linux/aarch64/start.cpp
 libc/loader/linux/x86_64/start.cpp
 libc/src/assert/__assert_fail.h
 libc/src/ctype/isalnum.cpp
@@ -2428,9 +2656,14 @@
 libc/src/ctype/toupper.h
 libc/src/errno/dummy_errno.cpp
 libc/src/errno/dummy_errno.h
+libc/src/errno/errno.cpp
 libc/src/errno/llvmlibc_errno.h
-libc/src/errno/__errno_location.cpp
-libc/src/errno/__errno_location.h
+libc/src/fcntl/creat.h
+libc/src/fcntl/open.h
+libc/src/fcntl/openat.h
+libc/src/fcntl/linux/creat.cpp
+libc/src/fcntl/linux/open.cpp
+libc/src/fcntl/linux/openat.cpp
 libc/src/fenv/feclearexcept.cpp
 libc/src/fenv/feclearexcept.h
 libc/src/fenv/fedisableexcept.cpp
@@ -2512,6 +2745,9 @@
 libc/src/math/llround.h
 libc/src/math/llroundf.h
 libc/src/math/llroundl.h
+libc/src/math/log10f.h
+libc/src/math/log1pf.h
+libc/src/math/log2f.h
 libc/src/math/logb.h
 libc/src/math/logbf.h
 libc/src/math/logbl.h
@@ -2566,6 +2802,8 @@
 libc/src/math/generic/ceil.cpp
 libc/src/math/generic/ceilf.cpp
 libc/src/math/generic/ceill.cpp
+libc/src/math/generic/common_constants.cpp
+libc/src/math/generic/common_constants.h
 libc/src/math/generic/copysign.cpp
 libc/src/math/generic/copysignf.cpp
 libc/src/math/generic/copysignl.cpp
@@ -2609,6 +2847,9 @@
 libc/src/math/generic/llround.cpp
 libc/src/math/generic/llroundf.cpp
 libc/src/math/generic/llroundl.cpp
+libc/src/math/generic/log10f.cpp
+libc/src/math/generic/log1pf.cpp
+libc/src/math/generic/log2f.cpp
 libc/src/math/generic/logb.cpp
 libc/src/math/generic/logbf.cpp
 libc/src/math/generic/logbl.cpp
@@ -2654,9 +2895,6 @@
 libc/src/math/generic/truncl.cpp
 libc/src/math/x86_64/cos.cpp
 libc/src/math/x86_64/sin.cpp
-libc/src/math/x86_64/sqrt.cpp
-libc/src/math/x86_64/sqrtf.cpp
-libc/src/math/x86_64/sqrtl.cpp
 libc/src/math/x86_64/tan.cpp
 libc/src/signal/raise.h
 libc/src/signal/sigaction.h
@@ -2679,10 +2917,11 @@
 libc/src/stdio/FILE.h
 libc/src/stdio/fwrite.cpp
 libc/src/stdio/fwrite.h
-libc/src/stdlib/abort.cpp
 libc/src/stdlib/abort.h
 libc/src/stdlib/abs.cpp
 libc/src/stdlib/abs.h
+libc/src/stdlib/atexit.cpp
+libc/src/stdlib/atexit.h
 libc/src/stdlib/atof.cpp
 libc/src/stdlib/atof.h
 libc/src/stdlib/atoi.cpp
@@ -2695,6 +2934,10 @@
 libc/src/stdlib/bsearch.h
 libc/src/stdlib/div.cpp
 libc/src/stdlib/div.h
+libc/src/stdlib/exit.cpp
+libc/src/stdlib/exit.h
+libc/src/stdlib/getenv.cpp
+libc/src/stdlib/getenv.h
 libc/src/stdlib/labs.cpp
 libc/src/stdlib/labs.h
 libc/src/stdlib/ldiv.cpp
@@ -2720,6 +2963,7 @@
 libc/src/stdlib/strtoull.cpp
 libc/src/stdlib/strtoull.h
 libc/src/stdlib/_Exit.h
+libc/src/stdlib/linux/abort.cpp
 libc/src/stdlib/linux/_Exit.cpp
 libc/src/string/bcmp.cpp
 libc/src/string/bcmp.h
@@ -2783,7 +3027,6 @@
 libc/src/string/strtok_r.cpp
 libc/src/string/strtok_r.h
 libc/src/string/memory_utils/bcmp_implementations.h
-libc/src/string/memory_utils/elements.h
 libc/src/string/memory_utils/elements_aarch64.h
 libc/src/string/memory_utils/elements_x86.h
 libc/src/string/memory_utils/memcmp_implementations.h
@@ -2793,25 +3036,30 @@
 libc/src/sys/mman/mmap.h
 libc/src/sys/mman/munmap.h
 libc/src/sys/mman/linux/mmap.cpp
-libc/src/sys/mman/linux/munmap.cpp
+libc/src/sys/stat/mkdir.h
+libc/src/sys/stat/mkdirat.h
+libc/src/sys/stat/linux/mkdir.cpp
+libc/src/sys/stat/linux/mkdirat.cpp
 libc/src/threads/call_once.h
 libc/src/threads/cnd_broadcast.h
 libc/src/threads/cnd_destroy.h
 libc/src/threads/cnd_init.h
 libc/src/threads/cnd_signal.h
 libc/src/threads/cnd_wait.h
+libc/src/threads/mtx_destroy.cpp
 libc/src/threads/mtx_destroy.h
+libc/src/threads/mtx_init.cpp
 libc/src/threads/mtx_init.h
+libc/src/threads/mtx_lock.cpp
 libc/src/threads/mtx_lock.h
+libc/src/threads/mtx_unlock.cpp
 libc/src/threads/mtx_unlock.h
 libc/src/threads/thrd_create.h
 libc/src/threads/thrd_join.h
 libc/src/threads/linux/call_once.cpp
 libc/src/threads/linux/CndVar.h
+libc/src/threads/linux/cnd_wait.cpp
 libc/src/threads/linux/Futex.h
-libc/src/threads/linux/mtx_destroy.cpp
-libc/src/threads/linux/mtx_init.cpp
-libc/src/threads/linux/Mutex.h
 libc/src/threads/linux/thrd_create.cpp
 libc/src/threads/linux/thrd_join.cpp
 libc/src/threads/linux/Thread.h
@@ -2827,7 +3075,19 @@
 libc/src/time/mktime.h
 libc/src/time/time_utils.cpp
 libc/src/time/time_utils.h
+libc/src/unistd/close.h
+libc/src/unistd/fsync.h
+libc/src/unistd/read.h
+libc/src/unistd/rmdir.h
+libc/src/unistd/unlink.h
+libc/src/unistd/unlinkat.h
 libc/src/unistd/write.h
+libc/src/unistd/linux/close.cpp
+libc/src/unistd/linux/fsync.cpp
+libc/src/unistd/linux/read.cpp
+libc/src/unistd/linux/rmdir.cpp
+libc/src/unistd/linux/unlink.cpp
+libc/src/unistd/linux/unlinkat.cpp
 libc/src/unistd/linux/write.cpp
 libc/src/__support/architectures.h
 libc/src/__support/common.h
@@ -2841,43 +3101,50 @@
 libc/src/__support/str_to_integer.h
 libc/src/__support/CPP/Array.h
 libc/src/__support/CPP/ArrayRef.h
+libc/src/__support/CPP/atomic.h
+libc/src/__support/CPP/Bit.h
 libc/src/__support/CPP/Bitset.h
 libc/src/__support/CPP/Functional.h
 libc/src/__support/CPP/Limits.h
 libc/src/__support/CPP/StringView.h
-libc/src/__support/CPP/TypeTraits.h
+libc/src/__support/CPP/Utility.h
+libc/src/__support/CPP/vector.h
+libc/src/__support/File/file.cpp
+libc/src/__support/File/file.h
 libc/src/__support/FPUtil/BasicOperations.h
 libc/src/__support/FPUtil/DivisionAndRemainderOperations.h
 libc/src/__support/FPUtil/FEnvImpl.h
-libc/src/__support/FPUtil/FEnvUtils.h
 libc/src/__support/FPUtil/FloatProperties.h
 libc/src/__support/FPUtil/FMA.h
 libc/src/__support/FPUtil/FPBits.h
-libc/src/__support/FPUtil/Hypot.h
 libc/src/__support/FPUtil/ManipulationFunctions.h
 libc/src/__support/FPUtil/NearestIntegerOperations.h
 libc/src/__support/FPUtil/NormalFloat.h
 libc/src/__support/FPUtil/PlatformDefs.h
 libc/src/__support/FPUtil/PolyEval.h
-libc/src/__support/FPUtil/Sqrt.h
+libc/src/__support/FPUtil/sqrt.h
 libc/src/__support/FPUtil/UInt.h
 libc/src/__support/FPUtil/XFloat.h
 libc/src/__support/FPUtil/aarch64/FEnvImpl.h
 libc/src/__support/FPUtil/aarch64/FMA.h
+libc/src/__support/FPUtil/aarch64/sqrt.h
 libc/src/__support/FPUtil/generic/FMA.h
+libc/src/__support/FPUtil/generic/sqrt.h
+libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
 libc/src/__support/FPUtil/x86_64/FEnvImpl.h
 libc/src/__support/FPUtil/x86_64/FMA.h
-libc/src/__support/FPUtil/x86_64/LongDoubleBits.h
 libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h
 libc/src/__support/FPUtil/x86_64/PolyEval.h
-libc/src/__support/FPUtil/x86_64/SqrtLongDouble.h
+libc/src/__support/FPUtil/x86_64/sqrt.h
 libc/src/__support/OSUtil/io.h
 libc/src/__support/OSUtil/quick_exit.h
 libc/src/__support/OSUtil/syscall.h
 libc/src/__support/OSUtil/linux/io.h
-libc/src/__support/OSUtil/linux/quick_exit.h
 libc/src/__support/OSUtil/linux/syscall.h
+libc/src/__support/OSUtil/linux/aarch64/syscall.h
 libc/src/__support/OSUtil/linux/x86_64/syscall.h
+libc/src/__support/threads/mutex.h
+libc/src/__support/threads/linux/mutex.h
 libc/utils/HdrGen/Command.cpp
 libc/utils/HdrGen/Command.h
 libc/utils/HdrGen/Generator.cpp
@@ -2909,8 +3176,9 @@
 libc/utils/UnitTest/FPMatcher.h
 libc/utils/UnitTest/FuchsiaTest.h
 libc/utils/UnitTest/LibcTest.cpp
-libc/utils/UnitTest/LibcTest.h
 libc/utils/UnitTest/LibcTestMain.cpp
+libc/utils/UnitTest/MemoryMatcher.cpp
+libc/utils/UnitTest/MemoryMatcher.h
 libc/utils/UnitTest/PlatformDefs.h
 libc/utils/UnitTest/Test.h
 libclc/generic/include/config.h
@@ -3119,6 +3387,7 @@
 libclc/generic/lib/math/ep_log.h
 libcxx/benchmarks/format.bench.cpp
 libcxx/benchmarks/formatted_size.bench.cpp
+libcxx/benchmarks/formatter_float.bench.cpp
 libcxx/benchmarks/format_to.bench.cpp
 libcxx/benchmarks/format_to_n.bench.cpp
 libcxx/benchmarks/to_chars.bench.cpp
@@ -3126,21 +3395,49 @@
 libcxx/benchmarks/variant_visit_1.bench.cpp
 libcxx/benchmarks/variant_visit_2.bench.cpp
 libcxx/benchmarks/variant_visit_3.bench.cpp
+libcxx/include/__algorithm/adjacent_find.h
+libcxx/include/__algorithm/all_of.h
+libcxx/include/__algorithm/any_of.h
+libcxx/include/__algorithm/count.h
+libcxx/include/__algorithm/count_if.h
+libcxx/include/__algorithm/find.h
+libcxx/include/__algorithm/find_first_of.h
+libcxx/include/__algorithm/find_if.h
+libcxx/include/__algorithm/find_if_not.h
+libcxx/include/__algorithm/for_each.h
+libcxx/include/__algorithm/for_each_n.h
+libcxx/include/__algorithm/iter_swap.h
+libcxx/include/__algorithm/mismatch.h
+libcxx/include/__algorithm/none_of.h
+libcxx/include/__algorithm/swap_ranges.h
+libcxx/include/__compare/is_eq.h
 libcxx/include/__filesystem/file_time_type.h
 libcxx/include/__filesystem/file_type.h
 libcxx/include/__filesystem/space_info.h
+libcxx/include/__format/formatter_floating_point.h
+libcxx/include/__format/formatter_pointer.h
 libcxx/include/__memory/voidify.h
 libcxx/include/__numeric/exclusive_scan.h
 libcxx/include/__numeric/inclusive_scan.h
 libcxx/include/__numeric/reduce.h
 libcxx/include/__numeric/transform_reduce.h
+libcxx/include/__random/default_random_engine.h
+libcxx/include/__random/knuth_b.h
+libcxx/include/__ranges/dangling.h
+libcxx/include/__ranges/enable_borrowed_range.h
 libcxx/include/__support/ibm/gettod_zos.h
 libcxx/include/__support/ibm/nanosleep.h
 libcxx/include/__support/openbsd/xlocale.h
 libcxx/include/__support/solaris/floatingpoint.h
 libcxx/include/__support/solaris/wchar.h
+libcxx/include/__utility/auto_cast.h
+libcxx/include/__utility/declval.h
+libcxx/include/__utility/forward.h
+libcxx/include/__utility/move.h
+libcxx/include/__utility/swap.h
 libcxx/src/chrono_system_time_init.h
 libcxx/src/format.cpp
+libcxx/src/ios.instantiations.cpp
 libcxx/src/iostream_init.h
 libcxx/src/legacy_pointer_safety.cpp
 libcxx/src/utility.cpp
@@ -3175,6 +3472,7 @@
 lld/COFF/TypeMerger.h
 lld/COFF/Writer.h
 lld/Common/Args.cpp
+lld/Common/CommonLinkerContext.cpp
 lld/Common/DWARF.cpp
 lld/Common/Memory.cpp
 lld/Common/Reproduce.cpp
@@ -3213,6 +3511,8 @@
 lld/ELF/Arch/SPARCV9.cpp
 lld/include/lld/Common/Args.h
 lld/include/lld/Common/Arrays.h
+lld/include/lld/Common/CommonLinkerContext.h
+lld/include/lld/Common/Driver.h
 lld/include/lld/Common/DWARF.h
 lld/include/lld/Common/Filesystem.h
 lld/include/lld/Common/Strings.h
@@ -3251,6 +3551,8 @@
 lld/MachO/OutputSegment.h
 lld/MachO/Relocations.cpp
 lld/MachO/Relocations.h
+lld/MachO/SectionPriorities.cpp
+lld/MachO/SectionPriorities.h
 lld/MachO/Symbols.cpp
 lld/MachO/Symbols.h
 lld/MachO/SymbolTable.cpp
@@ -3370,7 +3672,6 @@
 lldb/include/lldb/Core/Debugger.h
 lldb/include/lldb/Core/Declaration.h
 lldb/include/lldb/Core/DumpRegisterValue.h
-lldb/include/lldb/Core/dwarf.h
 lldb/include/lldb/Core/EmulateInstruction.h
 lldb/include/lldb/Core/Highlighter.h
 lldb/include/lldb/Core/IOHandlerCursesGUI.h
@@ -3417,6 +3718,7 @@
 lldb/include/lldb/Host/File.h
 lldb/include/lldb/Host/FileAction.h
 lldb/include/lldb/Host/FileSystem.h
+lldb/include/lldb/Host/Host.h
 lldb/include/lldb/Host/HostGetOpt.h
 lldb/include/lldb/Host/HostInfo.h
 lldb/include/lldb/Host/HostNativeProcess.h
@@ -3429,7 +3731,6 @@
 lldb/include/lldb/Host/PseudoTerminal.h
 lldb/include/lldb/Host/SafeMachO.h
 lldb/include/lldb/Host/Socket.h
-lldb/include/lldb/Host/SocketAddress.h
 lldb/include/lldb/Host/Terminal.h
 lldb/include/lldb/Host/Time.h
 lldb/include/lldb/Host/XML.h
@@ -3511,6 +3812,7 @@
 lldb/include/lldb/Target/JITLoader.h
 lldb/include/lldb/Target/JITLoaderList.h
 lldb/include/lldb/Target/MemoryTagManager.h
+lldb/include/lldb/Target/MemoryTagMap.h
 lldb/include/lldb/Target/ModuleCache.h
 lldb/include/lldb/Target/OperatingSystem.h
 lldb/include/lldb/Target/PostMortemProcess.h
@@ -3564,9 +3866,10 @@
 lldb/include/lldb/Utility/FileSpec.h
 lldb/include/lldb/Utility/Flags.h
 lldb/include/lldb/Utility/GDBRemote.h
+lldb/include/lldb/Utility/Instrumentation.h
 lldb/include/lldb/Utility/IOObject.h
 lldb/include/lldb/Utility/LLDBAssert.h
-lldb/include/lldb/Utility/Logging.h
+lldb/include/lldb/Utility/LLDBLog.h
 lldb/include/lldb/Utility/Predicate.h
 lldb/include/lldb/Utility/ProcessInfo.h
 lldb/include/lldb/Utility/RangeMap.h
@@ -3593,14 +3896,45 @@
 lldb/include/lldb/Utility/VASPrintf.h
 lldb/include/lldb/Utility/VMRange.h
 lldb/include/lldb/Version/Version.h
+lldb/source/API/SBAddress.cpp
+lldb/source/API/SBAttachInfo.cpp
+lldb/source/API/SBBroadcaster.cpp
 lldb/source/API/SBCommandInterpreterRunOptions.cpp
+lldb/source/API/SBCommunication.cpp
+lldb/source/API/SBCompileUnit.cpp
+lldb/source/API/SBDebugger.cpp
+lldb/source/API/SBEnvironment.cpp
+lldb/source/API/SBFile.cpp
+lldb/source/API/SBFileSpec.cpp
+lldb/source/API/SBFileSpecList.cpp
+lldb/source/API/SBFunction.cpp
+lldb/source/API/SBHostOS.cpp
+lldb/source/API/SBLanguageRuntime.cpp
+lldb/source/API/SBLaunchInfo.cpp
+lldb/source/API/SBLineEntry.cpp
+lldb/source/API/SBListener.cpp
 lldb/source/API/SBModule.cpp
-lldb/source/API/SBReproducerPrivate.h
+lldb/source/API/SBModuleSpec.cpp
+lldb/source/API/SBProcessInfo.cpp
+lldb/source/API/SBQueueItem.cpp
+lldb/source/API/SBSection.cpp
+lldb/source/API/SBStream.cpp
+lldb/source/API/SBStringList.cpp
+lldb/source/API/SBSymbol.cpp
+lldb/source/API/SBSymbolContext.cpp
+lldb/source/API/SBThreadPlan.cpp
+lldb/source/API/SBTrace.cpp
+lldb/source/API/SBTypeFilter.cpp
+lldb/source/API/SBTypeFormat.cpp
+lldb/source/API/SBUnixSignals.cpp
+lldb/source/API/SBValueList.cpp
+lldb/source/API/SBWatchpoint.cpp
 lldb/source/API/SystemInitializerFull.cpp
 lldb/source/API/SystemInitializerFull.h
 lldb/source/API/Utils.h
 lldb/source/Breakpoint/BreakpointList.cpp
 lldb/source/Breakpoint/BreakpointPrecondition.cpp
+lldb/source/Breakpoint/BreakpointResolverAddress.cpp
 lldb/source/Breakpoint/BreakpointSiteList.cpp
 lldb/source/Breakpoint/StoppointCallbackContext.cpp
 lldb/source/Breakpoint/WatchpointList.cpp
@@ -3666,6 +4000,7 @@
 lldb/source/Core/AddressResolverFileLine.cpp
 lldb/source/Core/Communication.cpp
 lldb/source/Core/Declaration.cpp
+lldb/source/Core/DumpDataExtractor.cpp
 lldb/source/Core/DumpRegisterValue.cpp
 lldb/source/Core/EmulateInstruction.cpp
 lldb/source/Core/FileLineResolver.cpp
@@ -3716,6 +4051,7 @@
 lldb/source/Host/freebsd/HostInfoFreeBSD.cpp
 lldb/source/Host/linux/AbstractSocket.cpp
 lldb/source/Host/linux/Host.cpp
+lldb/source/Host/linux/HostInfoLinux.cpp
 lldb/source/Host/linux/LibcGlue.cpp
 lldb/source/Host/linux/Support.cpp
 lldb/source/Host/macosx/cfcpp/CFCBundle.cpp
@@ -3732,6 +4068,7 @@
 lldb/source/Host/macosx/cfcpp/CoreFoundationCPP.h
 lldb/source/Host/macosx/objcxx/PosixSpawnResponsible.h
 lldb/source/Host/openbsd/HostInfoOpenBSD.cpp
+lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
 lldb/source/Host/posix/FileSystemPosix.cpp
 lldb/source/Host/posix/HostInfoPosix.cpp
 lldb/source/Host/posix/HostThreadPosix.cpp
@@ -3881,7 +4218,6 @@
 lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h
 lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
 lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.h
-lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
 lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.h
 lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp
 lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.h
@@ -3963,18 +4299,16 @@
 lldb/source/Plugins/Process/FreeBSD/NativeThreadFreeBSD.cpp
 lldb/source/Plugins/Process/FreeBSD/NativeThreadFreeBSD.h
 lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
-lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.h
 lldb/source/Plugins/Process/FreeBSDKernel/RegisterContextFreeBSDKernel_arm64.cpp
 lldb/source/Plugins/Process/FreeBSDKernel/RegisterContextFreeBSDKernel_arm64.h
 lldb/source/Plugins/Process/FreeBSDKernel/RegisterContextFreeBSDKernel_i386.h
 lldb/source/Plugins/Process/FreeBSDKernel/RegisterContextFreeBSDKernel_x86_64.cpp
 lldb/source/Plugins/Process/FreeBSDKernel/RegisterContextFreeBSDKernel_x86_64.h
+lldb/source/Plugins/Process/FreeBSDKernel/ThreadFreeBSDKernel.cpp
 lldb/source/Plugins/Process/FreeBSDKernel/ThreadFreeBSDKernel.h
 lldb/source/Plugins/Process/gdb-remote/GDBRemoteClientBase.h
 lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.h
 lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationHistory.h
-lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationReplayServer.cpp
-lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationReplayServer.h
 lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp
 lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.h
 lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.h
@@ -3982,6 +4316,9 @@
 lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.cpp
 lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerPlatform.h
 lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterContext.h
+lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterFallback.cpp
+lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterFallback.h
+lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemoteLog.h
 lldb/source/Plugins/Process/gdb-remote/ThreadGDBRemote.cpp
 lldb/source/Plugins/Process/gdb-remote/ThreadGDBRemote.h
 lldb/source/Plugins/Process/Linux/IntelPTManager.cpp
@@ -4030,6 +4367,8 @@
 lldb/source/Plugins/Process/POSIX/NativeProcessELF.h
 lldb/source/Plugins/Process/POSIX/ProcessMessage.cpp
 lldb/source/Plugins/Process/POSIX/ProcessMessage.h
+lldb/source/Plugins/Process/POSIX/ProcessPOSIXLog.cpp
+lldb/source/Plugins/Process/POSIX/ProcessPOSIXLog.h
 lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp
 lldb/source/Plugins/Process/scripted/ScriptedProcess.h
 lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
@@ -4207,6 +4546,7 @@
 lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
 lldb/source/Plugins/SymbolFile/DWARF/HashedNameToDIE.cpp
 lldb/source/Plugins/SymbolFile/DWARF/HashedNameToDIE.h
+lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.h
 lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h
 lldb/source/Plugins/SymbolFile/DWARF/NameToDIE.cpp
 lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -4293,6 +4633,7 @@
 lldb/source/Target/Language.cpp
 lldb/source/Target/MemoryHistory.cpp
 lldb/source/Target/MemoryRegionInfo.cpp
+lldb/source/Target/MemoryTagMap.cpp
 lldb/source/Target/ModuleCache.cpp
 lldb/source/Target/OperatingSystem.cpp
 lldb/source/Target/ProcessTrace.cpp
@@ -4302,11 +4643,11 @@
 lldb/source/Target/SectionLoadHistory.cpp
 lldb/source/Target/SectionLoadList.cpp
 lldb/source/Target/StackID.cpp
-lldb/source/Target/Statistics.cpp
 lldb/source/Target/SystemRuntime.cpp
 lldb/source/Target/ThreadCollection.cpp
 lldb/source/Target/ThreadPlanCallFunctionUsingABI.cpp
 lldb/source/Target/ThreadPlanCallOnFunctionExit.cpp
+lldb/source/Target/ThreadPlanCallUserExpression.cpp
 lldb/source/Target/ThreadPlanRunToAddress.cpp
 lldb/source/Target/ThreadPlanShouldStopHere.cpp
 lldb/source/Target/ThreadPlanStepInRange.cpp
@@ -4332,15 +4673,16 @@
 lldb/source/Utility/DataExtractor.cpp
 lldb/source/Utility/Environment.cpp
 lldb/source/Utility/GDBRemote.cpp
+lldb/source/Utility/Instrumentation.cpp
 lldb/source/Utility/IOObject.cpp
 lldb/source/Utility/Listener.cpp
 lldb/source/Utility/LLDBAssert.cpp
+lldb/source/Utility/LLDBLog.cpp
 lldb/source/Utility/NameMatches.cpp
 lldb/source/Utility/PPC64LE_DWARF_Registers.h
 lldb/source/Utility/PPC64_DWARF_Registers.h
 lldb/source/Utility/RegularExpression.cpp
 lldb/source/Utility/Reproducer.cpp
-lldb/source/Utility/ReproducerInstrumentation.cpp
 lldb/source/Utility/ReproducerProvider.cpp
 lldb/source/Utility/State.cpp
 lldb/source/Utility/Status.cpp
@@ -4460,7 +4802,6 @@
 lldb/tools/lldb-vscode/SourceBreakpoint.cpp
 lldb/tools/lldb-vscode/SourceBreakpoint.h
 lldb/tools/lldb-vscode/SourceReference.h
-lldb/tools/lldb-vscode/VSCode.cpp
 lldb/tools/lldb-vscode/VSCode.h
 lldb/tools/lldb-vscode/VSCodeForward.h
 lldb/unittests/gtest_common.h
@@ -4495,13 +4836,16 @@
 lldb/unittests/Host/ProcessLaunchInfoTest.cpp
 lldb/unittests/Host/SocketAddressTest.cpp
 lldb/unittests/Host/SocketTestUtilities.h
+lldb/unittests/Host/ThreadLauncherTest.cpp
 lldb/unittests/Host/linux/HostTest.cpp
 lldb/unittests/Host/linux/SupportTest.cpp
 lldb/unittests/Interpreter/TestOptionValueFileColonLine.cpp
+lldb/unittests/Interpreter/TestRegexCommand.cpp
 lldb/unittests/Language/CLanguages/CLanguagesTest.cpp
 lldb/unittests/Language/Highlighting/HighlighterTest.cpp
 lldb/unittests/ObjectFile/Breakpad/BreakpadRecordsTest.cpp
 lldb/unittests/Platform/PlatformDarwinTest.cpp
+lldb/unittests/Platform/PlatformSiginfoTest.cpp
 lldb/unittests/Process/ProcessEventDataTest.cpp
 lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerLLGSTest.cpp
 lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerTest.cpp
@@ -4541,6 +4885,7 @@
 lldb/unittests/Target/DynamicRegisterInfoTest.cpp
 lldb/unittests/Target/ExecutionContextTest.cpp
 lldb/unittests/Target/FindFileTest.cpp
+lldb/unittests/Target/MemoryTagMapTest.cpp
 lldb/unittests/Target/RemoteAwarePlatformTest.cpp
 lldb/unittests/Target/StackFrameRecognizerTest.cpp
 lldb/unittests/TestingSupport/MockTildeExpressionResolver.cpp
@@ -4635,17 +4980,16 @@
 llvm/include/llvm/ADT/ilist_iterator.h
 llvm/include/llvm/ADT/ilist_node.h
 llvm/include/llvm/ADT/ilist_node_base.h
-llvm/include/llvm/ADT/ilist_node_options.h
 llvm/include/llvm/ADT/IntrusiveRefCntPtr.h
 llvm/include/llvm/ADT/PointerEmbeddedInt.h
 llvm/include/llvm/ADT/ScopeExit.h
 llvm/include/llvm/ADT/Sequence.h
 llvm/include/llvm/ADT/simple_ilist.h
 llvm/include/llvm/ADT/Statistic.h
+llvm/include/llvm/ADT/STLArrayExtras.h
 llvm/include/llvm/ADT/STLForwardCompat.h
 llvm/include/llvm/ADT/StringSet.h
 llvm/include/llvm/ADT/TypeSwitch.h
-llvm/include/llvm/ADT/Waymarking.h
 llvm/include/llvm/Analysis/BlockFrequencyInfo.h
 llvm/include/llvm/Analysis/CFLAliasAnalysisUtils.h
 llvm/include/llvm/Analysis/CFLAndersAliasAnalysis.h
@@ -4669,6 +5013,7 @@
 llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h
 llvm/include/llvm/Analysis/InstCount.h
 llvm/include/llvm/Analysis/InstructionSimplify.h
+llvm/include/llvm/Analysis/InstSimplifyFolder.h
 llvm/include/llvm/Analysis/IteratedDominanceFrontier.h
 llvm/include/llvm/Analysis/Lint.h
 llvm/include/llvm/Analysis/LoopCacheAnalysis.h
@@ -4685,9 +5030,9 @@
 llvm/include/llvm/Analysis/ObjCARCUtil.h
 llvm/include/llvm/Analysis/OverflowInstAnalysis.h
 llvm/include/llvm/Analysis/PhiValues.h
-llvm/include/llvm/Analysis/ReleaseModeModelRunner.h
 llvm/include/llvm/Analysis/ReplayInlineAdvisor.h
 llvm/include/llvm/Analysis/ScalarEvolutionDivision.h
+llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
 llvm/include/llvm/Analysis/ScalarEvolutionNormalization.h
 llvm/include/llvm/Analysis/ScopedNoAliasAA.h
 llvm/include/llvm/Analysis/StackLifetime.h
@@ -4700,17 +5045,16 @@
 llvm/include/llvm/Analysis/Utils/TFUtils.h
 llvm/include/llvm/AsmParser/LLToken.h
 llvm/include/llvm/AsmParser/SlotMapping.h
-llvm/include/llvm/BinaryFormat/AMDGPUMetadataVerifier.h
 llvm/include/llvm/BinaryFormat/COFF.h
 llvm/include/llvm/BinaryFormat/Magic.h
 llvm/include/llvm/BinaryFormat/Minidump.h
 llvm/include/llvm/BinaryFormat/MsgPackDocument.h
 llvm/include/llvm/BinaryFormat/MsgPackReader.h
 llvm/include/llvm/BinaryFormat/MsgPackWriter.h
+llvm/include/llvm/BinaryFormat/Swift.h
 llvm/include/llvm/BinaryFormat/WasmTraits.h
 llvm/include/llvm/Bitcode/BitcodeAnalyzer.h
 llvm/include/llvm/Bitcode/BitcodeCommon.h
-llvm/include/llvm/Bitcode/BitcodeConvenience.h
 llvm/include/llvm/CodeGen/AsmPrinter.h
 llvm/include/llvm/CodeGen/AsmPrinterHandler.h
 llvm/include/llvm/CodeGen/BasicBlockSectionUtils.h
@@ -4737,7 +5081,6 @@
 llvm/include/llvm/CodeGen/MachineLoopUtils.h
 llvm/include/llvm/CodeGen/MachineModuleInfoImpls.h
 llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
-llvm/include/llvm/CodeGen/MachineOutliner.h
 llvm/include/llvm/CodeGen/MachinePassManager.h
 llvm/include/llvm/CodeGen/MachineRegionInfo.h
 llvm/include/llvm/CodeGen/MachineSSAContext.h
@@ -4751,6 +5094,7 @@
 llvm/include/llvm/CodeGen/PBQPRAConstraint.h
 llvm/include/llvm/CodeGen/PreISelIntrinsicLowering.h
 llvm/include/llvm/CodeGen/RegisterBank.h
+llvm/include/llvm/CodeGen/RegisterBankInfo.h
 llvm/include/llvm/CodeGen/RegisterClassInfo.h
 llvm/include/llvm/CodeGen/ReplaceWithVeclib.h
 llvm/include/llvm/CodeGen/ScheduleDAGMutation.h
@@ -4909,9 +5253,11 @@
 llvm/include/llvm/DebugInfo/PDB/Native/RawError.h
 llvm/include/llvm/DebugInfo/PDB/Native/RawTypes.h
 llvm/include/llvm/DebugInfo/PDB/Native/TpiHashing.h
+llvm/include/llvm/DebugInfo/Symbolize/DIFetcher.h
 llvm/include/llvm/DebugInfo/Symbolize/DIPrinter.h
 llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
 llvm/include/llvm/Debuginfod/Debuginfod.h
+llvm/include/llvm/Debuginfod/DIFetcher.h
 llvm/include/llvm/Debuginfod/HTTPClient.h
 llvm/include/llvm/Demangle/Demangle.h
 llvm/include/llvm/Demangle/StringView.h
@@ -4926,6 +5272,7 @@
 llvm/include/llvm/ExecutionEngine/GenericValue.h
 llvm/include/llvm/ExecutionEngine/SectionMemoryManager.h
 llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h
+llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h
 llvm/include/llvm/ExecutionEngine/JITLink/ELF.h
 llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch64.h
 llvm/include/llvm/ExecutionEngine/JITLink/ELF_riscv.h
@@ -4960,6 +5307,7 @@
 llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
 llvm/include/llvm/ExecutionEngine/Orc/Mangling.h
 llvm/include/llvm/ExecutionEngine/Orc/ObjectFileInterface.h
+llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
 llvm/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
 llvm/include/llvm/ExecutionEngine/Orc/OrcABISupport.h
 llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
@@ -5022,6 +5370,7 @@
 llvm/include/llvm/MC/MCAsmInfoXCOFF.h
 llvm/include/llvm/MC/MCAsmLayout.h
 llvm/include/llvm/MC/MCCodeView.h
+llvm/include/llvm/MC/MCContext.h
 llvm/include/llvm/MC/MCFixedLenDisassembler.h
 llvm/include/llvm/MC/MCLabel.h
 llvm/include/llvm/MC/MCObjectWriter.h
@@ -5063,18 +5412,19 @@
 llvm/include/llvm/MCA/Stages/MicroOpQueueStage.h
 llvm/include/llvm/MCA/Stages/RetireStage.h
 llvm/include/llvm/MCA/Stages/Stage.h
-llvm/include/llvm/ObjCopy/MultiFormatConfig.h
-llvm/include/llvm/ObjCopy/ConfigManager.h
 llvm/include/llvm/ObjCopy/CommonConfig.h
+llvm/include/llvm/ObjCopy/MultiFormatConfig.h
 llvm/include/llvm/ObjCopy/ObjCopy.h
-llvm/include/llvm/ObjCopy/wasm/WasmConfig.h
-llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h
+llvm/include/llvm/ObjCopy/COFF/COFFConfig.h
+llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h
 llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
 llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h
-llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h
 llvm/include/llvm/ObjCopy/MachO/MachOConfig.h
-llvm/include/llvm/ObjCopy/COFF/COFFConfig.h
-llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h
+llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h
+llvm/include/llvm/ObjCopy/wasm/WasmConfig.h
+llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h
+llvm/include/llvm/ObjCopy/XCOFF/XCOFFConfig.h
+llvm/include/llvm/ObjCopy/XCOFF/XCOFFObjcopy.h
 llvm/include/llvm/Object/Archive.h
 llvm/include/llvm/Object/COFFModuleDefinition.h
 llvm/include/llvm/Object/Decompressor.h
@@ -5140,6 +5490,9 @@
 llvm/include/llvm/Support/CFGUpdate.h
 llvm/include/llvm/Support/CodeGenCoverage.h
 llvm/include/llvm/Support/CRC.h
+llvm/include/llvm/Support/CSKYAttributeParser.h
+llvm/include/llvm/Support/CSKYAttributes.h
+llvm/include/llvm/Support/CSKYTargetParser.h
 llvm/include/llvm/Support/DataTypes.h
 llvm/include/llvm/Support/DebugCounter.h
 llvm/include/llvm/Support/Discriminator.h
@@ -5179,17 +5532,16 @@
 llvm/include/llvm/Support/SymbolRemappingReader.h
 llvm/include/llvm/Support/SystemUtils.h
 llvm/include/llvm/Support/TargetParser.h
-llvm/include/llvm/Support/TimeProfiler.h
 llvm/include/llvm/Support/TrailingObjects.h
 llvm/include/llvm/Support/Unicode.h
 llvm/include/llvm/Support/UnicodeCharRanges.h
 llvm/include/llvm/Support/VersionTuple.h
-llvm/include/llvm/Support/VirtualFileSystem.h
 llvm/include/llvm/Support/WindowsError.h
 llvm/include/llvm/Support/WithColor.h
 llvm/include/llvm/Support/FileSystem/UniqueID.h
 llvm/include/llvm/Support/Solaris/sys/regset.h
 llvm/include/llvm/TableGen/DirectiveEmitter.h
+llvm/include/llvm/TableGen/Parser.h
 llvm/include/llvm/TableGen/StringToOffsetTable.h
 llvm/include/llvm/Target/CGPassBuilderOption.h
 llvm/include/llvm/Target/CodeGenCWrappers.h
@@ -5223,6 +5575,7 @@
 llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h
 llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h
 llvm/include/llvm/Transforms/IPO/Annotation2Metadata.h
+llvm/include/llvm/Transforms/IPO/ArgumentPromotion.h
 llvm/include/llvm/Transforms/IPO/Attributor.h
 llvm/include/llvm/Transforms/IPO/BlockExtractor.h
 llvm/include/llvm/Transforms/IPO/CalledValuePropagation.h
@@ -5323,6 +5676,7 @@
 llvm/include/llvm/Transforms/Utils/LowerSwitch.h
 llvm/include/llvm/Transforms/Utils/MatrixUtils.h
 llvm/include/llvm/Transforms/Utils/Mem2Reg.h
+llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
 llvm/include/llvm/Transforms/Utils/MetaRenamer.h
 llvm/include/llvm/Transforms/Utils/NameAnonGlobals.h
 llvm/include/llvm/Transforms/Utils/RelLookupTableConverter.h
@@ -5340,7 +5694,6 @@
 llvm/include/llvm/Transforms/Utils/UnifyLoopExits.h
 llvm/include/llvm/Transforms/Utils/ValueMapper.h
 llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
-llvm/include/llvm/WindowsDriver/MSVCPaths.h
 llvm/include/llvm/WindowsDriver/MSVCSetupApi.h
 llvm/include/llvm/WindowsManifest/WindowsManifestMerger.h
 llvm/include/llvm/WindowsResource/ResourceScriptToken.h
@@ -5413,6 +5766,7 @@
 llvm/lib/Analysis/ValueLatticeUtils.cpp
 llvm/lib/Analysis/VFABIDemangling.cpp
 llvm/lib/AsmParser/Parser.cpp
+llvm/lib/BinaryFormat/COFF.cpp
 llvm/lib/BinaryFormat/ELF.cpp
 llvm/lib/BinaryFormat/MachO.cpp
 llvm/lib/BinaryFormat/Magic.cpp
@@ -5437,6 +5791,7 @@
 llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
 llvm/lib/CodeGen/GCMetadataPrinter.cpp
 llvm/lib/CodeGen/IndirectBrExpandPass.cpp
+llvm/lib/CodeGen/JMCInstrumenter.cpp
 llvm/lib/CodeGen/LiveDebugVariables.h
 llvm/lib/CodeGen/LiveIntervalCalc.cpp
 llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -5462,6 +5817,7 @@
 llvm/lib/CodeGen/MIRSampleProfile.cpp
 llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
 llvm/lib/CodeGen/MIRYamlMapping.cpp
+llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp
 llvm/lib/CodeGen/MultiHazardRecognizer.cpp
 llvm/lib/CodeGen/NonRelocatableStringpool.cpp
 llvm/lib/CodeGen/ParallelCG.cpp
@@ -5470,6 +5826,7 @@
 llvm/lib/CodeGen/RegAllocBase.cpp
 llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp
 llvm/lib/CodeGen/RegAllocEvictionAdvisor.h
+llvm/lib/CodeGen/RegAllocGreedy.h
 llvm/lib/CodeGen/RegAllocScore.cpp
 llvm/lib/CodeGen/RegAllocScore.h
 llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp
@@ -5509,7 +5866,6 @@
 llvm/lib/CodeGen/GlobalISel/Localizer.cpp
 llvm/lib/CodeGen/GlobalISel/LostDebugLocObserver.cpp
 llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
-llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h
 llvm/lib/CodeGen/MIRParser/MILexer.h
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp
 llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
@@ -5557,6 +5913,7 @@
 llvm/lib/DebugInfo/PDB/PDBSymbol.cpp
 llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
 llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
+llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
 llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
 llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
 llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
@@ -5636,9 +5993,11 @@
 llvm/lib/DebugInfo/PDB/Native/SymbolStream.cpp
 llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp
 llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+llvm/lib/DebugInfo/Symbolize/DIFetcher.cpp
 llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
 llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
 llvm/lib/Debuginfod/Debuginfod.cpp
+llvm/lib/Debuginfod/DIFetcher.cpp
 llvm/lib/Debuginfod/HTTPClient.cpp
 llvm/lib/Demangle/Demangle.cpp
 llvm/lib/Demangle/DLangDemangle.cpp
@@ -5652,10 +6011,8 @@
 llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
 llvm/lib/ExecutionEngine/JITLink/aarch64.cpp
 llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h
-llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
 llvm/lib/ExecutionEngine/JITLink/ELF.cpp
 llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.cpp
-llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
 llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp
 llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
 llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
@@ -5689,6 +6046,7 @@
 llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp
 llvm/lib/ExecutionEngine/Orc/Speculation.cpp
 llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp
+llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp
 llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp
 llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
 llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp
@@ -5779,38 +6137,43 @@
 llvm/lib/MCA/Stages/RetireStage.cpp
 llvm/lib/MCA/Stages/Stage.cpp
 llvm/lib/ObjCopy/Archive.cpp
-llvm/lib/ObjCopy/ConfigManager.cpp
-llvm/lib/ObjCopy/ObjCopy.cpp
 llvm/lib/ObjCopy/Archive.h
-llvm/lib/ObjCopy/wasm/Reader.cpp
-llvm/lib/ObjCopy/wasm/Reader.h
-llvm/lib/ObjCopy/wasm/Object.cpp
-llvm/lib/ObjCopy/wasm/Writer.cpp
-llvm/lib/ObjCopy/wasm/Writer.h
-llvm/lib/ObjCopy/wasm/Object.h
-llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp
-llvm/lib/ObjCopy/ELF/Object.cpp
-llvm/lib/ObjCopy/MachO/MachOWriter.cpp
-llvm/lib/ObjCopy/MachO/Object.cpp
-llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h
-llvm/lib/ObjCopy/MachO/MachOWriter.h
-llvm/lib/ObjCopy/MachO/MachOReader.h
-llvm/lib/ObjCopy/MachO/MachOReader.cpp
-llvm/lib/ObjCopy/MachO/Object.h
+llvm/lib/ObjCopy/ConfigManager.cpp
+llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp
+llvm/lib/ObjCopy/COFF/COFFObject.cpp
+llvm/lib/ObjCopy/COFF/COFFObject.h
+llvm/lib/ObjCopy/COFF/COFFReader.cpp
+llvm/lib/ObjCopy/COFF/COFFReader.h
+llvm/lib/ObjCopy/COFF/COFFWriter.cpp
+llvm/lib/ObjCopy/COFF/COFFWriter.h
+llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+llvm/lib/ObjCopy/ELF/ELFObject.cpp
+llvm/lib/ObjCopy/ELF/ELFObject.h
 llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
+llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h
 llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
-llvm/lib/ObjCopy/COFF/Reader.cpp
-llvm/lib/ObjCopy/COFF/Reader.h
-llvm/lib/ObjCopy/COFF/Object.cpp
-llvm/lib/ObjCopy/COFF/Writer.cpp
-llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp
-llvm/lib/ObjCopy/COFF/Writer.h
-llvm/lib/ObjCopy/COFF/Object.h
+llvm/lib/ObjCopy/MachO/MachOObject.cpp
+llvm/lib/ObjCopy/MachO/MachOObject.h
+llvm/lib/ObjCopy/MachO/MachOReader.cpp
+llvm/lib/ObjCopy/MachO/MachOReader.h
+llvm/lib/ObjCopy/MachO/MachOWriter.cpp
+llvm/lib/ObjCopy/MachO/MachOWriter.h
+llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp
+llvm/lib/ObjCopy/wasm/WasmObject.cpp
+llvm/lib/ObjCopy/wasm/WasmObject.h
+llvm/lib/ObjCopy/wasm/WasmReader.cpp
+llvm/lib/ObjCopy/wasm/WasmReader.h
+llvm/lib/ObjCopy/wasm/WasmWriter.cpp
+llvm/lib/ObjCopy/wasm/WasmWriter.h
+llvm/lib/ObjCopy/XCOFF/XCOFFObject.h
+llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp
+llvm/lib/ObjCopy/XCOFF/XCOFFReader.h
 llvm/lib/Object/Archive.cpp
 llvm/lib/Object/Binary.cpp
 llvm/lib/Object/Decompressor.cpp
 llvm/lib/Object/FaultMapParser.cpp
 llvm/lib/Object/IRObjectFile.cpp
+llvm/lib/Object/IRSymtab.cpp
 llvm/lib/Object/MachOUniversalWriter.cpp
 llvm/lib/Object/Minidump.cpp
 llvm/lib/Object/ModuleSymbolTable.cpp
@@ -5835,7 +6198,7 @@
 llvm/lib/Passes/PassPlugin.cpp
 llvm/lib/ProfileData/GCOV.cpp
 llvm/lib/ProfileData/InstrProfCorrelator.cpp
-llvm/lib/ProfileData/RawMemProfReader.cpp
+llvm/lib/ProfileData/MemProf.cpp
 llvm/lib/ProfileData/SampleProfWriter.cpp
 llvm/lib/Remarks/BitstreamRemarkParser.h
 llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
@@ -5862,6 +6225,9 @@
 llvm/lib/Support/COM.cpp
 llvm/lib/Support/Compression.cpp
 llvm/lib/Support/CRC.cpp
+llvm/lib/Support/CSKYAttributeParser.cpp
+llvm/lib/Support/CSKYAttributes.cpp
+llvm/lib/Support/CSKYTargetParser.cpp
 llvm/lib/Support/DebugOptions.h
 llvm/lib/Support/DivisionByConstantInfo.cpp
 llvm/lib/Support/DJB.cpp
@@ -5905,12 +6271,16 @@
 llvm/lib/Support/VersionTuple.cpp
 llvm/lib/Support/Watchdog.cpp
 llvm/lib/Support/WithColor.cpp
+llvm/lib/TableGen/Parser.cpp
+llvm/lib/TableGen/RecordContext.h
 llvm/lib/TableGen/TableGenBackendSkeleton.cpp
+llvm/lib/Target/TargetIntrinsicInfo.cpp
 llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
 llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
-llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
 llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
 llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp
+llvm/lib/Target/AArch64/AArch64StackTagging.cpp
 llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
 llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
 llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
@@ -5941,7 +6311,6 @@
 llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
 llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
 llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
-llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
 llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
 llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
 llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -5969,10 +6338,9 @@
 llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
 llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.h
 llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
-llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
 llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
-llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
-llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
 llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
 llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
 llvm/lib/Target/ARC/ARC.h
@@ -6019,7 +6387,6 @@
 llvm/lib/Target/AVR/AVR.h
 llvm/lib/Target/AVR/AVRAsmPrinter.cpp
 llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
-llvm/lib/Target/AVR/AVRFrameLowering.cpp
 llvm/lib/Target/AVR/AVRFrameLowering.h
 llvm/lib/Target/AVR/AVRInstrInfo.cpp
 llvm/lib/Target/AVR/AVRInstrInfo.h
@@ -6052,14 +6419,12 @@
 llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
-llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
-llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
 llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
 llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
 llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
@@ -6079,6 +6444,9 @@
 llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
 llvm/lib/Target/CSKY/CSKYAsmPrinter.h
 llvm/lib/Target/CSKY/CSKYCallingConv.h
+llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
+llvm/lib/Target/CSKY/CSKYConstantPoolValue.cpp
+llvm/lib/Target/CSKY/CSKYConstantPoolValue.h
 llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
 llvm/lib/Target/CSKY/CSKYFrameLowering.h
 llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
@@ -6096,6 +6464,7 @@
 llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
 llvm/lib/Target/CSKY/CSKYTargetMachine.h
 llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp
+llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
 llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
 llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
 llvm/lib/Target/CSKY/MCTargetDesc/CSKYBaseInfo.h
@@ -6110,7 +6479,6 @@
 llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp
 llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.h
 llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
-llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
 llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp
 llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.h
 llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.h
@@ -6162,6 +6530,39 @@
 llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
 llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
 llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h
+llvm/lib/Target/LoongArch/LoongArch.h
+llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
+llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
+llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
+llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h
+llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
+llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
+llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
+llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h
+llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
+llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+llvm/lib/Target/LoongArch/LoongArchTargetMachine.h
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.h
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
+llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp
+llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.h
 llvm/lib/Target/M68k/M68k.h
 llvm/lib/Target/M68k/M68kAsmPrinter.cpp
 llvm/lib/Target/M68k/M68kAsmPrinter.h
@@ -6206,7 +6607,6 @@
 llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp
 llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h
 llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
-llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h
 llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp
 llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.h
 llvm/lib/Target/Mips/Mips16RegisterInfo.h
@@ -6253,6 +6653,7 @@
 llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
 llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h
 llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
+llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp
 llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
 llvm/lib/Target/PowerPC/PPCTargetMachine.h
 llvm/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -6272,17 +6673,16 @@
 llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h
 llvm/lib/Target/RISCV/RISCVCallLowering.cpp
 llvm/lib/Target/RISCV/RISCVCallLowering.h
-llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
 llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
-llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
-llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
 llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp
 llvm/lib/Target/RISCV/RISCVLegalizerInfo.cpp
 llvm/lib/Target/RISCV/RISCVLegalizerInfo.h
 llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
 llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp
 llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp
 llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h
+llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
 llvm/lib/Target/RISCV/RISCVTargetMachine.h
 llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
 llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
@@ -6326,11 +6726,12 @@
 llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h
 llvm/lib/Target/VE/LVLGen.cpp
 llvm/lib/Target/VE/VEAsmPrinter.cpp
+llvm/lib/Target/VE/VECustomDAG.cpp
+llvm/lib/Target/VE/VECustomDAG.h
 llvm/lib/Target/VE/VEFrameLowering.h
 llvm/lib/Target/VE/VEInstrBuilder.h
 llvm/lib/Target/VE/VEInstrInfo.h
 llvm/lib/Target/VE/VEISelDAGToDAG.cpp
-llvm/lib/Target/VE/VEISelLowering.cpp
 llvm/lib/Target/VE/VEMachineFunctionInfo.cpp
 llvm/lib/Target/VE/VEMachineFunctionInfo.h
 llvm/lib/Target/VE/VEMCInstLower.cpp
@@ -6341,6 +6742,7 @@
 llvm/lib/Target/VE/VETargetMachine.cpp
 llvm/lib/Target/VE/VETargetMachine.h
 llvm/lib/Target/VE/VETargetTransformInfo.h
+llvm/lib/Target/VE/VVPISelLowering.cpp
 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
 llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
 llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
@@ -6422,6 +6824,7 @@
 llvm/lib/Target/X86/X86FastTileConfig.cpp
 llvm/lib/Target/X86/X86InsertPrefetch.cpp
 llvm/lib/Target/X86/X86InsertWait.cpp
+llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
 llvm/lib/Target/X86/X86InterleavedAccess.cpp
 llvm/lib/Target/X86/X86LegalizerInfo.h
 llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
@@ -6435,6 +6838,8 @@
 llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
 llvm/lib/Target/X86/X86TargetMachine.h
 llvm/lib/Target/X86/X86TileConfig.cpp
+llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
+llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
 llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
 llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
 llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -6450,7 +6855,6 @@
 llvm/lib/Testing/Support/Annotations.cpp
 llvm/lib/Testing/Support/Error.cpp
 llvm/lib/Testing/Support/SupportHelpers.cpp
-llvm/lib/TextAPI/Architecture.cpp
 llvm/lib/TextAPI/ArchitectureSet.cpp
 llvm/lib/TextAPI/InterfaceFile.cpp
 llvm/lib/TextAPI/PackedVersion.cpp
@@ -6460,12 +6864,10 @@
 llvm/lib/TextAPI/TextAPIContext.h
 llvm/lib/TextAPI/TextStub.cpp
 llvm/lib/TextAPI/TextStubCommon.cpp
-llvm/lib/TextAPI/TextStubCommon.h
 llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
 llvm/lib/Transforms/CFGuard/CFGuard.cpp
 llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
 llvm/lib/Transforms/Instrumentation/CFGMST.h
-llvm/lib/Transforms/Instrumentation/CGProfile.cpp
 llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
 llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
 llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -6474,7 +6876,6 @@
 llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
 llvm/lib/Transforms/IPO/Annotation2Metadata.cpp
 llvm/lib/Transforms/IPO/Attributor.cpp
-llvm/lib/Transforms/IPO/AttributorAttributes.cpp
 llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
 llvm/lib/Transforms/IPO/ModuleInliner.cpp
 llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -6515,6 +6916,7 @@
 llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
 llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
 llvm/lib/Transforms/Utils/MatrixUtils.cpp
+llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
 llvm/lib/Transforms/Utils/SampleProfileInference.cpp
 llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
 llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -6583,9 +6985,9 @@
 llvm/tools/llvm-cov/SourceCoverageViewText.h
 llvm/tools/llvm-cov/TestingSupport.cpp
 llvm/tools/llvm-cxxdump/Error.cpp
-llvm/tools/llvm-cxxdump/llvm-cxxdump.h
 llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
 llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp
+llvm/tools/llvm-dis-fuzzer/llvm-dis-fuzzer.cpp
 llvm/tools/llvm-dlang-demangle-fuzzer/DummyDemanglerFuzzer.cpp
 llvm/tools/llvm-dlang-demangle-fuzzer/llvm-dlang-demangle-fuzzer.cpp
 llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -6670,7 +7072,6 @@
 llvm/tools/llvm-ml/Disassembler.h
 llvm/tools/llvm-modextract/llvm-modextract.cpp
 llvm/tools/llvm-objcopy/llvm-objcopy.cpp
-llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
 llvm/tools/llvm-objcopy/ObjcopyOptions.h
 llvm/tools/llvm-objdump/COFFDump.h
 llvm/tools/llvm-objdump/ELFDump.h
@@ -6703,7 +7104,6 @@
 llvm/tools/llvm-profgen/llvm-profgen.cpp
 llvm/tools/llvm-profgen/PerfReader.cpp
 llvm/tools/llvm-profgen/PerfReader.h
-llvm/tools/llvm-profgen/ProfileGenerator.h
 llvm/tools/llvm-rc/ResourceScriptCppFilter.cpp
 llvm/tools/llvm-rc/ResourceScriptCppFilter.h
 llvm/tools/llvm-rc/ResourceScriptParser.h
@@ -6715,7 +7115,6 @@
 llvm/tools/llvm-readobj/WindowsResourceDumper.h
 llvm/tools/llvm-reduce/DeltaManager.cpp
 llvm/tools/llvm-reduce/DeltaManager.h
-llvm/tools/llvm-reduce/llvm-reduce.cpp
 llvm/tools/llvm-reduce/ReducerWorkItem.cpp
 llvm/tools/llvm-reduce/ReducerWorkItem.h
 llvm/tools/llvm-reduce/TestRunner.cpp
@@ -6759,12 +7158,12 @@
 llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.h
 llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.cpp
 llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.h
+llvm/tools/llvm-remark-size-diff/RemarkSizeDiff.cpp
 llvm/tools/llvm-rust-demangle-fuzzer/DummyDemanglerFuzzer.cpp
 llvm/tools/llvm-rust-demangle-fuzzer/llvm-rust-demangle-fuzzer.cpp
 llvm/tools/llvm-shlib/libllvm.cpp
 llvm/tools/llvm-special-case-list-fuzzer/DummySpecialCaseListFuzzer.cpp
 llvm/tools/llvm-special-case-list-fuzzer/special-case-list-fuzzer.cpp
-llvm/tools/llvm-split/llvm-split.cpp
 llvm/tools/llvm-strings/llvm-strings.cpp
 llvm/tools/llvm-tapi-diff/DiffEngine.cpp
 llvm/tools/llvm-tapi-diff/DiffEngine.h
@@ -6810,7 +7209,6 @@
 llvm/unittests/ADT/DirectedGraphTest.cpp
 llvm/unittests/ADT/EnumeratedArrayTest.cpp
 llvm/unittests/ADT/FallibleIteratorTest.cpp
-llvm/unittests/ADT/FunctionExtrasTest.cpp
 llvm/unittests/ADT/FunctionRefTest.cpp
 llvm/unittests/ADT/IListBaseTest.cpp
 llvm/unittests/ADT/IListNodeBaseTest.cpp
@@ -6821,7 +7219,6 @@
 llvm/unittests/ADT/ScopeExitTest.cpp
 llvm/unittests/ADT/SequenceTest.cpp
 llvm/unittests/ADT/SetVectorTest.cpp
-llvm/unittests/ADT/SimpleIListTest.cpp
 llvm/unittests/ADT/SmallSetTest.cpp
 llvm/unittests/ADT/SparseMultiSetTest.cpp
 llvm/unittests/ADT/SparseSetTest.cpp
@@ -6832,7 +7229,6 @@
 llvm/unittests/ADT/StringSwitchTest.cpp
 llvm/unittests/ADT/TypeSwitchTest.cpp
 llvm/unittests/ADT/TypeTraitsTest.cpp
-llvm/unittests/ADT/WaymarkingTest.cpp
 llvm/unittests/Analysis/BasicAliasAnalysisTest.cpp
 llvm/unittests/Analysis/BlockFrequencyInfoTest.cpp
 llvm/unittests/Analysis/BranchProbabilityInfoTest.cpp
@@ -6855,6 +7251,7 @@
 llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
 llvm/unittests/Bitstream/BitstreamWriterTest.cpp
 llvm/unittests/CodeGen/AllocationOrderTest.cpp
+llvm/unittests/CodeGen/AMDGPUMetadataTest.cpp
 llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
 llvm/unittests/CodeGen/DIETest.cpp
 llvm/unittests/CodeGen/LexicalScopesTest.cpp
@@ -6870,6 +7267,7 @@
 llvm/unittests/DebugInfo/DWARF/DWARFAcceleratorTableTest.cpp
 llvm/unittests/DebugInfo/DWARF/DWARFDataExtractorTest.cpp
 llvm/unittests/DebugInfo/DWARF/DWARFDebugArangeSetTest.cpp
+llvm/unittests/DebugInfo/DWARF/DWARFDebugFrameTest.cpp
 llvm/unittests/DebugInfo/DWARF/DWARFDieManualExtractTest.cpp
 llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp
 llvm/unittests/DebugInfo/DWARF/DWARFExpressionCompactPrinterTest.cpp
@@ -6909,12 +7307,12 @@
 llvm/unittests/ExecutionEngine/Orc/ThreadSafeModuleTest.cpp
 llvm/unittests/Frontend/OpenACCTest.cpp
 llvm/unittests/Frontend/OpenMPContextTest.cpp
-llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
 llvm/unittests/Frontend/OpenMPParsingTest.cpp
 llvm/unittests/InterfaceStub/ELFYAMLTest.cpp
 llvm/unittests/IR/DemandedBitsTest.cpp
 llvm/unittests/IR/ManglerTest.cpp
 llvm/unittests/IR/ModuleTest.cpp
+llvm/unittests/IR/TimePassesTest.cpp
 llvm/unittests/IR/UseTest.cpp
 llvm/unittests/IR/VectorTypesTest.cpp
 llvm/unittests/MC/Disassembler.cpp
@@ -6940,6 +7338,7 @@
 llvm/unittests/Passes/TestPlugin.cpp
 llvm/unittests/Passes/TestPlugin.h
 llvm/unittests/ProfileData/InstrProfDataTest.cpp
+llvm/unittests/ProfileData/MemProfTest.cpp
 llvm/unittests/Remarks/BitstreamRemarksFormatTest.cpp
 llvm/unittests/Remarks/BitstreamRemarksParsingTest.cpp
 llvm/unittests/Remarks/RemarksLinkingTest.cpp
@@ -6950,6 +7349,8 @@
 llvm/unittests/Support/Base64Test.cpp
 llvm/unittests/Support/buffer_ostream_test.cpp
 llvm/unittests/Support/Chrono.cpp
+llvm/unittests/Support/CSKYAttributeParserTest.cpp
+llvm/unittests/Support/CSKYTargetParserTest.cpp
 llvm/unittests/Support/DebugCounterTest.cpp
 llvm/unittests/Support/DJBTest.cpp
 llvm/unittests/Support/ELFAttributeParserTest.cpp
@@ -6979,10 +7380,13 @@
 llvm/unittests/Support/WithColorTest.cpp
 llvm/unittests/Support/xxhashTest.cpp
 llvm/unittests/Support/CommandLineInit/CommandLineInitTest.cpp
+llvm/unittests/TableGen/ParserEntryPointTest.cpp
 llvm/unittests/Target/AArch64/MatrixRegisterAliasing.cpp
 llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
 llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
+llvm/unittests/Target/ARM/InstSizes.cpp
 llvm/unittests/Target/PowerPC/AIXRelocModelTest.cpp
+llvm/unittests/Testing/Support/TempPathTest.cpp
 llvm/unittests/TextAPI/TextStubHelpers.h
 llvm/unittests/TextAPI/TextStubV1Tests.cpp
 llvm/unittests/TextAPI/TextStubV2Tests.cpp
@@ -7005,13 +7409,13 @@
 llvm/unittests/tools/llvm-exegesis/X86/TestBase.h
 llvm/unittests/Transforms/IPO/AttributorTest.cpp
 llvm/unittests/Transforms/IPO/AttributorTestBase.h
+llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
 llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
 llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp
 llvm/unittests/Transforms/Utils/LoopUtilsTest.cpp
 llvm/unittests/Transforms/Utils/ModuleUtilsTest.cpp
 llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp
 llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
-llvm/unittests/Transforms/Utils/ValueMapperTest.cpp
 llvm/unittests/Transforms/Utils/VFABIUtils.cpp
 llvm/unittests/Transforms/Vectorize/VPlanDominatorTreeTest.cpp
 llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -7038,6 +7442,8 @@
 llvm/utils/TableGen/OptRSTEmitter.cpp
 llvm/utils/TableGen/PredicateExpander.h
 llvm/utils/TableGen/SDNodeProperties.cpp
+llvm/utils/TableGen/VarLenCodeEmitterGen.cpp
+llvm/utils/TableGen/VarLenCodeEmitterGen.h
 llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h
 llvm/utils/TableGen/GlobalISel/CodeExpander.cpp
 llvm/utils/TableGen/GlobalISel/CodeExpander.h
@@ -7045,6 +7451,8 @@
 llvm/utils/TableGen/GlobalISel/GIMatchDagEdge.cpp
 llvm/utils/TableGen/GlobalISel/GIMatchDagInstr.cpp
 llvm/utils/TableGen/GlobalISel/GIMatchDagInstr.h
+llvm/utils/TableGen/GlobalISel/GIMatchDagPredicate.cpp
+llvm/utils/TableGen/GlobalISel/GIMatchDagPredicate.h
 llvm/utils/TableGen/GlobalISel/GIMatchDagPredicateDependencyEdge.cpp
 llvm/utils/TableGen/GlobalISel/GIMatchDagPredicateDependencyEdge.h
 mlir/examples/standalone/include/Standalone/StandaloneDialect.h
@@ -7135,23 +7543,19 @@
 mlir/include/mlir/InitAllTranslations.h
 mlir/include/mlir/Parser.h
 mlir/include/mlir/Translation.h
-mlir/include/mlir/Analysis/AffineAnalysis.h
-mlir/include/mlir/Analysis/AffineStructures.h
 mlir/include/mlir/Analysis/BufferViewFlowAnalysis.h
 mlir/include/mlir/Analysis/DataFlowAnalysis.h
 mlir/include/mlir/Analysis/DataLayoutAnalysis.h
 mlir/include/mlir/Analysis/Liveness.h
-mlir/include/mlir/Analysis/LoopAnalysis.h
-mlir/include/mlir/Analysis/NestedMatcher.h
-mlir/include/mlir/Analysis/NumberOfExecutions.h
-mlir/include/mlir/Analysis/PresburgerSet.h
 mlir/include/mlir/Analysis/SliceAnalysis.h
-mlir/include/mlir/Analysis/Utils.h
 mlir/include/mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h
 mlir/include/mlir/Analysis/Presburger/Fraction.h
-mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h
+mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
 mlir/include/mlir/Analysis/Presburger/LinearTransform.h
 mlir/include/mlir/Analysis/Presburger/Matrix.h
+mlir/include/mlir/Analysis/Presburger/PresburgerSet.h
+mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h
+mlir/include/mlir/Analysis/Presburger/PWMAFunction.h
 mlir/include/mlir/Analysis/Presburger/Simplex.h
 mlir/include/mlir/Analysis/Presburger/Utils.h
 mlir/include/mlir/CAPI/AffineExpr.h
@@ -7175,7 +7579,13 @@
 mlir/include/mlir/Conversion/BufferizationToMemRef/BufferizationToMemRef.h
 mlir/include/mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h
 mlir/include/mlir/Conversion/ComplexToStandard/ComplexToStandard.h
+mlir/include/mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h
+mlir/include/mlir/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRV.h
+mlir/include/mlir/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.h
+mlir/include/mlir/Conversion/FuncToSPIRV/FuncToSPIRV.h
+mlir/include/mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h
 mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
 mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
 mlir/include/mlir/Conversion/GPUToROCDL/Runtimes.h
 mlir/include/mlir/Conversion/GPUToSPIRV/GPUToSPIRV.h
@@ -7204,19 +7614,19 @@
 mlir/include/mlir/Conversion/OpenMPToLLVM/ConvertOpenMPToLLVM.h
 mlir/include/mlir/Conversion/PDLToPDLInterp/PDLToPDLInterp.h
 mlir/include/mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h
+mlir/include/mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h
 mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
 mlir/include/mlir/Conversion/SCFToGPU/SCFToGPUPass.h
 mlir/include/mlir/Conversion/SCFToOpenMP/SCFToOpenMP.h
 mlir/include/mlir/Conversion/SCFToSPIRV/SCFToSPIRV.h
 mlir/include/mlir/Conversion/SCFToSPIRV/SCFToSPIRVPass.h
-mlir/include/mlir/Conversion/SCFToStandard/SCFToStandard.h
 mlir/include/mlir/Conversion/ShapeToStandard/ShapeToStandard.h
 mlir/include/mlir/Conversion/SPIRVToLLVM/SPIRVToLLVM.h
 mlir/include/mlir/Conversion/SPIRVToLLVM/SPIRVToLLVMPass.h
 mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h
 mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h
-mlir/include/mlir/Conversion/StandardToSPIRV/StandardToSPIRV.h
-mlir/include/mlir/Conversion/StandardToSPIRV/StandardToSPIRVPass.h
+mlir/include/mlir/Conversion/TensorToSPIRV/TensorToSPIRV.h
+mlir/include/mlir/Conversion/TensorToSPIRV/TensorToSPIRVPass.h
 mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h
 mlir/include/mlir/Conversion/TosaToSCF/TosaToSCF.h
 mlir/include/mlir/Conversion/TosaToStandard/TosaToStandard.h
@@ -7228,15 +7638,24 @@
 mlir/include/mlir/Conversion/VectorToSPIRV/VectorToSPIRVPass.h
 mlir/include/mlir/Dialect/CommonFolders.h
 mlir/include/mlir/Dialect/Traits.h
+mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
+mlir/include/mlir/Dialect/Affine/LoopUtils.h
 mlir/include/mlir/Dialect/Affine/Passes.h
 mlir/include/mlir/Dialect/Affine/Utils.h
+mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
+mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h
+mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+mlir/include/mlir/Dialect/Affine/Analysis/NestedMatcher.h
+mlir/include/mlir/Dialect/Affine/Analysis/Utils.h
 mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h
 mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
 mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
 mlir/include/mlir/Dialect/AMX/AMXDialect.h
 mlir/include/mlir/Dialect/AMX/Transforms.h
 mlir/include/mlir/Dialect/Arithmetic/IR/Arithmetic.h
+mlir/include/mlir/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.h
 mlir/include/mlir/Dialect/Arithmetic/Transforms/Passes.h
+mlir/include/mlir/Dialect/Arithmetic/Utils/Utils.h
 mlir/include/mlir/Dialect/ArmNeon/ArmNeonDialect.h
 mlir/include/mlir/Dialect/ArmSVE/ArmSVEDialect.h
 mlir/include/mlir/Dialect/ArmSVE/Transforms.h
@@ -7245,13 +7664,22 @@
 mlir/include/mlir/Dialect/Async/IR/Async.h
 mlir/include/mlir/Dialect/Async/IR/AsyncTypes.h
 mlir/include/mlir/Dialect/Bufferization/IR/AllocationOpInterface.h
+mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
 mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h
 mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h
+mlir/include/mlir/Dialect/Bufferization/Transforms/BufferUtils.h
+mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
 mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
 mlir/include/mlir/Dialect/Complex/IR/Complex.h
+mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlow.h
+mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.h
 mlir/include/mlir/Dialect/DLTI/DLTI.h
 mlir/include/mlir/Dialect/DLTI/Traits.h
 mlir/include/mlir/Dialect/EmitC/IR/EmitC.h
+mlir/include/mlir/Dialect/Func/IR/FuncOps.h
+mlir/include/mlir/Dialect/Func/Transforms/DecomposeCallGraphTypes.h
+mlir/include/mlir/Dialect/Func/Transforms/FuncConversions.h
+mlir/include/mlir/Dialect/Func/Transforms/Passes.h
 mlir/include/mlir/Dialect/GPU/GPUDialect.h
 mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
 mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
@@ -7260,16 +7688,10 @@
 mlir/include/mlir/Dialect/Linalg/Passes.h
 mlir/include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h
 mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/AffineInterfaceImpl.h
-mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/ArithInterfaceImpl.h
-mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/BufferizationInterfaceImpl.h
-mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/ComprehensiveBufferize.h
-mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.h
 mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.h
-mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/SCFInterfaceImpl.h
-mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/TensorInterfaceImpl.h
-mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/VectorInterfaceImpl.h
 mlir/include/mlir/Dialect/Linalg/IR/Linalg.h
 mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h
+mlir/include/mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h
 mlir/include/mlir/Dialect/Linalg/Transforms/CodegenStrategy.h
 mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
 mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h
@@ -7286,6 +7708,7 @@
 mlir/include/mlir/Dialect/Math/Transforms/Approximation.h
 mlir/include/mlir/Dialect/Math/Transforms/Passes.h
 mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
+mlir/include/mlir/Dialect/MemRef/Transforms/ComposeSubView.h
 mlir/include/mlir/Dialect/MemRef/Transforms/Passes.h
 mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
 mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -7300,13 +7723,16 @@
 mlir/include/mlir/Dialect/Quant/QuantOps.h
 mlir/include/mlir/Dialect/Quant/QuantTypes.h
 mlir/include/mlir/Dialect/Quant/UniformSupport.h
-mlir/include/mlir/Dialect/SCF/AffineCanonicalizationUtils.h
+mlir/include/mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h
 mlir/include/mlir/Dialect/SCF/Passes.h
 mlir/include/mlir/Dialect/SCF/SCF.h
 mlir/include/mlir/Dialect/SCF/Transforms.h
+mlir/include/mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h
+mlir/include/mlir/Dialect/SCF/Utils/Utils.h
 mlir/include/mlir/Dialect/Shape/IR/Shape.h
 mlir/include/mlir/Dialect/Shape/Transforms/Passes.h
 mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
+mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
 mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
 mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
 mlir/include/mlir/Dialect/SPIRV/IR/ParserUtils.h
@@ -7322,27 +7748,29 @@
 mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.h
 mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h
 mlir/include/mlir/Dialect/SPIRV/Utils/LayoutUtils.h
-mlir/include/mlir/Dialect/StandardOps/IR/Ops.h
-mlir/include/mlir/Dialect/StandardOps/Transforms/ComposeSubView.h
-mlir/include/mlir/Dialect/StandardOps/Transforms/DecomposeCallGraphTypes.h
-mlir/include/mlir/Dialect/StandardOps/Transforms/FuncConversions.h
-mlir/include/mlir/Dialect/StandardOps/Transforms/Passes.h
-mlir/include/mlir/Dialect/StandardOps/Utils/Utils.h
 mlir/include/mlir/Dialect/Tensor/IR/Tensor.h
 mlir/include/mlir/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.h
+mlir/include/mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h
+mlir/include/mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h
 mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h
+mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
+mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
 mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h
 mlir/include/mlir/Dialect/Tosa/Transforms/PassDetail.h
 mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h
+mlir/include/mlir/Dialect/Tosa/Utils/CoversionUtils.h
 mlir/include/mlir/Dialect/Tosa/Utils/QuantUtils.h
 mlir/include/mlir/Dialect/Tosa/Utils/ShapeUtils.h
+mlir/include/mlir/Dialect/Utils/IndexingUtils.h
 mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
 mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
 mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
-mlir/include/mlir/Dialect/Vector/VectorOps.h
-mlir/include/mlir/Dialect/Vector/VectorRewritePatterns.h
-mlir/include/mlir/Dialect/Vector/VectorTransforms.h
-mlir/include/mlir/Dialect/Vector/VectorUtils.h
+mlir/include/mlir/Dialect/Vector/IR/VectorOps.h
+mlir/include/mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h
+mlir/include/mlir/Dialect/Vector/Transforms/Passes.h
+mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
+mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h
+mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
 mlir/include/mlir/Dialect/X86Vector/Transforms.h
 mlir/include/mlir/Dialect/X86Vector/X86VectorDialect.h
 mlir/include/mlir/ExecutionEngine/AsyncRuntime.h
@@ -7350,6 +7778,7 @@
 mlir/include/mlir/ExecutionEngine/JitRunner.h
 mlir/include/mlir/ExecutionEngine/MemRefUtils.h
 mlir/include/mlir/ExecutionEngine/OptUtils.h
+mlir/include/mlir/ExecutionEngine/RunnerUtils.h
 mlir/include/mlir/ExecutionEngine/SparseTensorUtils.h
 mlir/include/mlir/Interfaces/CallInterfaces.h
 mlir/include/mlir/Interfaces/CastInterfaces.h
@@ -7376,8 +7805,7 @@
 mlir/include/mlir/IR/DialectImplementation.h
 mlir/include/mlir/IR/Dominance.h
 mlir/include/mlir/IR/FunctionImplementation.h
-mlir/include/mlir/IR/FunctionSupport.h
-mlir/include/mlir/IR/Identifier.h
+mlir/include/mlir/IR/FunctionInterfaces.h
 mlir/include/mlir/IR/ImplicitLocOpBuilder.h
 mlir/include/mlir/IR/Matchers.h
 mlir/include/mlir/IR/MLIRContext.h
@@ -7393,6 +7821,7 @@
 mlir/include/mlir/IR/TypeUtilities.h
 mlir/include/mlir/IR/Value.h
 mlir/include/mlir/IR/Verifier.h
+mlir/include/mlir/IR/Visitors.h
 mlir/include/mlir/Parser/AsmParserState.h
 mlir/include/mlir/Reducer/PassDetail.h
 mlir/include/mlir/Reducer/Passes.h
@@ -7453,17 +7882,20 @@
 mlir/include/mlir/Tools/mlir-reduce/MlirReduceMain.h
 mlir/include/mlir/Tools/PDLL/AST/Context.h
 mlir/include/mlir/Tools/PDLL/AST/Diagnostic.h
+mlir/include/mlir/Tools/PDLL/CodeGen/CPPGen.h
+mlir/include/mlir/Tools/PDLL/CodeGen/MLIRGen.h
+mlir/include/mlir/Tools/PDLL/ODS/Constraint.h
+mlir/include/mlir/Tools/PDLL/ODS/Context.h
+mlir/include/mlir/Tools/PDLL/ODS/Dialect.h
+mlir/include/mlir/Tools/PDLL/ODS/Operation.h
 mlir/include/mlir/Tools/PDLL/Parser/Parser.h
-mlir/include/mlir/Transforms/BufferUtils.h
+mlir/include/mlir/Transforms/ControlFlowSinkUtils.h
 mlir/include/mlir/Transforms/DialectConversion.h
 mlir/include/mlir/Transforms/GreedyPatternRewriteDriver.h
 mlir/include/mlir/Transforms/InliningUtils.h
 mlir/include/mlir/Transforms/LocationSnapshot.h
-mlir/include/mlir/Transforms/LoopFusionUtils.h
-mlir/include/mlir/Transforms/LoopUtils.h
 mlir/include/mlir/Transforms/Passes.h
 mlir/include/mlir/Transforms/RegionUtils.h
-mlir/include/mlir/Transforms/Utils.h
 mlir/include/mlir-c/AffineExpr.h
 mlir/include/mlir-c/AffineMap.h
 mlir/include/mlir-c/BuiltinAttributes.h
@@ -7481,44 +7913,44 @@
 mlir/include/mlir-c/Transforms.h
 mlir/include/mlir-c/Bindings/Python/Interop.h
 mlir/include/mlir-c/Dialect/Async.h
+mlir/include/mlir-c/Dialect/Func.h
 mlir/include/mlir-c/Dialect/GPU.h
 mlir/include/mlir-c/Dialect/Linalg.h
 mlir/include/mlir-c/Dialect/LLVM.h
+mlir/include/mlir-c/Dialect/PDL.h
+mlir/include/mlir-c/Dialect/Quant.h
 mlir/include/mlir-c/Dialect/SCF.h
 mlir/include/mlir-c/Dialect/Shape.h
 mlir/include/mlir-c/Dialect/SparseTensor.h
-mlir/include/mlir-c/Dialect/Standard.h
 mlir/include/mlir-c/Dialect/Tensor.h
-mlir/lib/Analysis/AffineAnalysis.cpp
-mlir/lib/Analysis/AffineStructures.cpp
 mlir/lib/Analysis/AliasAnalysis.cpp
 mlir/lib/Analysis/BufferViewFlowAnalysis.cpp
 mlir/lib/Analysis/CallGraph.cpp
 mlir/lib/Analysis/DataFlowAnalysis.cpp
 mlir/lib/Analysis/DataLayoutAnalysis.cpp
 mlir/lib/Analysis/Liveness.cpp
-mlir/lib/Analysis/LoopAnalysis.cpp
-mlir/lib/Analysis/NestedMatcher.cpp
-mlir/lib/Analysis/NumberOfExecutions.cpp
-mlir/lib/Analysis/PresburgerSet.cpp
 mlir/lib/Analysis/SliceAnalysis.cpp
-mlir/lib/Analysis/Utils.cpp
 mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp
-mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp
+mlir/lib/Analysis/Presburger/IntegerRelation.cpp
 mlir/lib/Analysis/Presburger/LinearTransform.cpp
 mlir/lib/Analysis/Presburger/Matrix.cpp
+mlir/lib/Analysis/Presburger/PresburgerSet.cpp
+mlir/lib/Analysis/Presburger/PresburgerSpace.cpp
+mlir/lib/Analysis/Presburger/PWMAFunction.cpp
 mlir/lib/Analysis/Presburger/Simplex.cpp
 mlir/lib/Analysis/Presburger/Utils.cpp
 mlir/lib/Bindings/Python/AllPassesRegistration.cpp
 mlir/lib/Bindings/Python/AsyncPasses.cpp
 mlir/lib/Bindings/Python/DialectLinalg.cpp
-mlir/lib/Bindings/Python/Dialects.h
+mlir/lib/Bindings/Python/DialectPDL.cpp
+mlir/lib/Bindings/Python/DialectQuant.cpp
 mlir/lib/Bindings/Python/DialectSparseTensor.cpp
 mlir/lib/Bindings/Python/ExecutionEngineModule.cpp
 mlir/lib/Bindings/Python/Globals.h
 mlir/lib/Bindings/Python/GPUPasses.cpp
 mlir/lib/Bindings/Python/IRAffine.cpp
 mlir/lib/Bindings/Python/IRAttributes.cpp
+mlir/lib/Bindings/Python/IRCore.cpp
 mlir/lib/Bindings/Python/IRInterfaces.cpp
 mlir/lib/Bindings/Python/IRModule.cpp
 mlir/lib/Bindings/Python/IRModule.h
@@ -7536,16 +7968,18 @@
 mlir/lib/CAPI/Debug/Debug.cpp
 mlir/lib/CAPI/Dialect/Async.cpp
 mlir/lib/CAPI/Dialect/AsyncPasses.cpp
+mlir/lib/CAPI/Dialect/Func.cpp
 mlir/lib/CAPI/Dialect/GPU.cpp
 mlir/lib/CAPI/Dialect/GPUPasses.cpp
 mlir/lib/CAPI/Dialect/Linalg.cpp
 mlir/lib/CAPI/Dialect/LinalgPasses.cpp
 mlir/lib/CAPI/Dialect/LLVM.cpp
+mlir/lib/CAPI/Dialect/PDL.cpp
+mlir/lib/CAPI/Dialect/Quant.cpp
 mlir/lib/CAPI/Dialect/SCF.cpp
 mlir/lib/CAPI/Dialect/Shape.cpp
 mlir/lib/CAPI/Dialect/SparseTensor.cpp
 mlir/lib/CAPI/Dialect/SparseTensorPasses.cpp
-mlir/lib/CAPI/Dialect/Standard.cpp
 mlir/lib/CAPI/Dialect/Tensor.cpp
 mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp
 mlir/lib/CAPI/Interfaces/Interfaces.cpp
@@ -7569,9 +8003,13 @@
 mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
 mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp
 mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
+mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
+mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRV.cpp
+mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp
+mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRV.cpp
+mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp
 mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
 mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
-mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
 mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
 mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
 mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -7583,7 +8021,6 @@
 mlir/lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp
 mlir/lib/Conversion/LinalgToLLVM/LinalgToLLVM.cpp
 mlir/lib/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.cpp
-mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp
 mlir/lib/Conversion/LLVMCommon/ConversionTarget.cpp
 mlir/lib/Conversion/LLVMCommon/LoweringOptions.cpp
 mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp
@@ -7611,12 +8048,12 @@
 mlir/lib/Conversion/PDLToPDLInterp/RootOrdering.cpp
 mlir/lib/Conversion/PDLToPDLInterp/RootOrdering.h
 mlir/lib/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.cpp
+mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
 mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
 mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
 mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
 mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp
 mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRVPass.cpp
-mlir/lib/Conversion/SCFToStandard/SCFToStandard.cpp
 mlir/lib/Conversion/ShapeToStandard/ConvertShapeConstraints.cpp
 mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp
 mlir/lib/Conversion/SPIRVCommon/Pattern.h
@@ -7624,8 +8061,8 @@
 mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
 mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVMPass.cpp
 mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
-mlir/lib/Conversion/StandardToSPIRV/StandardToSPIRV.cpp
-mlir/lib/Conversion/StandardToSPIRV/StandardToSPIRVPass.cpp
+mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRV.cpp
+mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp
 mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
 mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
 mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp
@@ -7634,13 +8071,17 @@
 mlir/lib/Conversion/TosaToSCF/TosaToSCFPass.cpp
 mlir/lib/Conversion/TosaToStandard/TosaToStandard.cpp
 mlir/lib/Conversion/TosaToStandard/TosaToStandardPass.cpp
-mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
 mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
 mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp
 mlir/lib/Conversion/VectorToROCDL/VectorToROCDL.cpp
 mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
 mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRVPass.cpp
 mlir/lib/Dialect/Traits.cpp
+mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
+mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp
+mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+mlir/lib/Dialect/Affine/Analysis/NestedMatcher.cpp
+mlir/lib/Dialect/Affine/Analysis/Utils.cpp
 mlir/lib/Dialect/Affine/IR/AffineMemoryOpInterfaces.cpp
 mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
 mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp
@@ -7648,20 +8089,25 @@
 mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp
 mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
 mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
+mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
+mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
 mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
 mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
 mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp
 mlir/lib/Dialect/Affine/Transforms/PassDetail.h
+mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp
 mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp
 mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
+mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
 mlir/lib/Dialect/Affine/Utils/Utils.cpp
 mlir/lib/Dialect/AMX/IR/AMXDialect.cpp
 mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
 mlir/lib/Dialect/Arithmetic/IR/ArithmeticDialect.cpp
-mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp
+mlir/lib/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.cpp
 mlir/lib/Dialect/Arithmetic/Transforms/Bufferize.cpp
-mlir/lib/Dialect/Arithmetic/Transforms/ExpandOps.cpp
 mlir/lib/Dialect/Arithmetic/Transforms/PassDetail.h
+mlir/lib/Dialect/Arithmetic/Utils/Utils.cpp
 mlir/lib/Dialect/ArmNeon/IR/ArmNeonDialect.cpp
 mlir/lib/Dialect/ArmSVE/IR/ArmSVEDialect.cpp
 mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp
@@ -7673,16 +8119,27 @@
 mlir/lib/Dialect/Async/Transforms/PassDetail.cpp
 mlir/lib/Dialect/Async/Transforms/PassDetail.h
 mlir/lib/Dialect/Bufferization/IR/AllocationOpInterface.cpp
+mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
 mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp
 mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
 mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocation.cpp
 mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
+mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
+mlir/lib/Dialect/Bufferization/Transforms/BufferUtils.cpp
+mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
 mlir/lib/Dialect/Bufferization/Transforms/PassDetail.h
 mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp
 mlir/lib/Dialect/Complex/IR/ComplexOps.cpp
+mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
 mlir/lib/Dialect/DLTI/DLTI.cpp
 mlir/lib/Dialect/DLTI/Traits.cpp
 mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+mlir/lib/Dialect/Func/IR/FuncOps.cpp
+mlir/lib/Dialect/Func/Transforms/DecomposeCallGraphTypes.cpp
+mlir/lib/Dialect/Func/Transforms/FuncBufferize.cpp
+mlir/lib/Dialect/Func/Transforms/FuncConversions.cpp
+mlir/lib/Dialect/Func/Transforms/PassDetail.h
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
 mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
 mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -7693,22 +8150,15 @@
 mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
 mlir/lib/Dialect/Linalg/Analysis/DependenceAnalysis.cpp
 mlir/lib/Dialect/Linalg/ComprehensiveBufferize/AffineInterfaceImpl.cpp
-mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ArithInterfaceImpl.cpp
-mlir/lib/Dialect/Linalg/ComprehensiveBufferize/BufferizableOpInterface.cpp
-mlir/lib/Dialect/Linalg/ComprehensiveBufferize/BufferizationInterfaceImpl.cpp
-mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ComprehensiveBufferize.cpp
-mlir/lib/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.cpp
-mlir/lib/Dialect/Linalg/ComprehensiveBufferize/SCFInterfaceImpl.cpp
-mlir/lib/Dialect/Linalg/ComprehensiveBufferize/TensorInterfaceImpl.cpp
-mlir/lib/Dialect/Linalg/ComprehensiveBufferize/VectorInterfaceImpl.cpp
+mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp
 mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
 mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp
 mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
 mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp
 mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
 mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp
-mlir/lib/Dialect/Linalg/Transforms/Distribution.cpp
 mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
 mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
 mlir/lib/Dialect/Linalg/Transforms/ElementwiseToLinalg.cpp
@@ -7722,16 +8172,16 @@
 mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
 mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
 mlir/lib/Dialect/Linalg/Transforms/NamedOpConversions.cpp
+mlir/lib/Dialect/Linalg/Transforms/PadOpInterchange.cpp
 mlir/lib/Dialect/Linalg/Transforms/PassDetail.h
 mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
+mlir/lib/Dialect/Linalg/Transforms/SparseTensorRewriting.cpp
 mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
 mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
 mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp
 mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
-mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
 mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
-mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
 mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
 mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h
@@ -7744,7 +8194,11 @@
 mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
 mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
 mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+mlir/lib/Dialect/MemRef/Transforms/ExpandOps.cpp
 mlir/lib/Dialect/MemRef/Transforms/FoldSubViewOps.cpp
+mlir/lib/Dialect/MemRef/Transforms/MultiBuffer.cpp
+mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp
+mlir/lib/Dialect/MemRef/Transforms/PassDetail.h
 mlir/lib/Dialect/MemRef/Transforms/ResolveShapedTypeResultDims.cpp
 mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -7763,17 +8217,19 @@
 mlir/lib/Dialect/Quant/Utils/QuantizeUtils.cpp
 mlir/lib/Dialect/Quant/Utils/UniformSupport.cpp
 mlir/lib/Dialect/SCF/SCF.cpp
-mlir/lib/Dialect/SCF/Transforms/AffineCanonicalizationUtils.cpp
+mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
 mlir/lib/Dialect/SCF/Transforms/Bufferize.cpp
 mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp
 mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp
 mlir/lib/Dialect/SCF/Transforms/LoopRangeFolding.cpp
 mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp
+mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
 mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
 mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
 mlir/lib/Dialect/SCF/Transforms/PassDetail.h
 mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp
-mlir/lib/Dialect/SCF/Transforms/Utils.cpp
+mlir/lib/Dialect/SCF/Utils/AffineCanonicalizationUtils.cpp
+mlir/lib/Dialect/SCF/Utils/Utils.cpp
 mlir/lib/Dialect/Shape/IR/Shape.cpp
 mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp
 mlir/lib/Dialect/Shape/Transforms/PassDetail.h
@@ -7781,9 +8237,11 @@
 mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp
 mlir/lib/Dialect/Shape/Transforms/StructuralTypeConversions.cpp
 mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
 mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
 mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
-mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
 mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
 mlir/lib/Dialect/SPIRV/IR/SPIRVAttributes.cpp
 mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
@@ -7796,46 +8254,50 @@
 mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp
 mlir/lib/Dialect/SPIRV/Transforms/PassDetail.h
 mlir/lib/Dialect/SPIRV/Transforms/RewriteInsertsPass.cpp
+mlir/lib/Dialect/SPIRV/Transforms/UnifyAliasedResourcePass.cpp
 mlir/lib/Dialect/SPIRV/Transforms/UpdateVCEPass.cpp
 mlir/lib/Dialect/SPIRV/Utils/LayoutUtils.cpp
-mlir/lib/Dialect/StandardOps/IR/Ops.cpp
-mlir/lib/Dialect/StandardOps/Transforms/Bufferize.cpp
-mlir/lib/Dialect/StandardOps/Transforms/ComposeSubView.cpp
-mlir/lib/Dialect/StandardOps/Transforms/DecomposeCallGraphTypes.cpp
-mlir/lib/Dialect/StandardOps/Transforms/ExpandOps.cpp
-mlir/lib/Dialect/StandardOps/Transforms/FuncBufferize.cpp
-mlir/lib/Dialect/StandardOps/Transforms/FuncConversions.cpp
-mlir/lib/Dialect/StandardOps/Transforms/PassDetail.h
-mlir/lib/Dialect/StandardOps/Transforms/TensorConstantBufferize.cpp
-mlir/lib/Dialect/StandardOps/Utils/Utils.cpp
 mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
 mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
+mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
 mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp
 mlir/lib/Dialect/Tensor/Transforms/PassDetail.h
+mlir/lib/Dialect/Tensor/Transforms/SplitPadding.cpp
+mlir/lib/Dialect/Tensor/Utils/Utils.cpp
 mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp
+mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp
 mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
 mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp
 mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp
-mlir/lib/Dialect/Tosa/Transforms/TosaOptimization.cpp
+mlir/lib/Dialect/Tosa/Transforms/TosaOptionalDecompositions.cpp
+mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
 mlir/lib/Dialect/Tosa/Utils/QuantUtils.cpp
+mlir/lib/Dialect/Utils/IndexingUtils.cpp
 mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
 mlir/lib/Dialect/Utils/StaticValueUtils.cpp
 mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp
-mlir/lib/Dialect/Vector/VectorDropLeadUnitDim.cpp
-mlir/lib/Dialect/Vector/VectorInsertExtractStridedSliceRewritePatterns.cpp
-mlir/lib/Dialect/Vector/VectorMultiDimReductionTransforms.cpp
-mlir/lib/Dialect/Vector/VectorOps.cpp
-mlir/lib/Dialect/Vector/VectorTransferOpTransforms.cpp
-mlir/lib/Dialect/Vector/VectorTransferPermutationMapRewritePatterns.cpp
-mlir/lib/Dialect/Vector/VectorUnrollDistribute.cpp
-mlir/lib/Dialect/Vector/VectorUtils.cpp
+mlir/lib/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.cpp
+mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp
+mlir/lib/Dialect/Vector/Transforms/PassDetail.h
+mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
+mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp
+mlir/lib/Dialect/Vector/Transforms/VectorMultiDimReductionTransforms.cpp
+mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
+mlir/lib/Dialect/Vector/Transforms/VectorTransferPermutationMapRewritePatterns.cpp
+mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp
+mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+mlir/lib/Dialect/Vector/Transforms/VectorUnrollDistribute.cpp
+mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
 mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp
 mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp
 mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp
 mlir/lib/ExecutionEngine/AsyncRuntime.cpp
 mlir/lib/ExecutionEngine/CRunnerUtils.cpp
 mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+mlir/lib/ExecutionEngine/ExecutionEngine.cpp
 mlir/lib/ExecutionEngine/JitRunner.cpp
 mlir/lib/ExecutionEngine/OptUtils.cpp
 mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -7856,25 +8318,22 @@
 mlir/lib/IR/AffineExprDetail.h
 mlir/lib/IR/AffineMap.cpp
 mlir/lib/IR/AffineMapDetail.h
-mlir/lib/IR/AsmPrinter.cpp
 mlir/lib/IR/AttributeDetail.h
 mlir/lib/IR/Attributes.cpp
 mlir/lib/IR/Builders.cpp
 mlir/lib/IR/BuiltinAttributeInterfaces.cpp
 mlir/lib/IR/BuiltinAttributes.cpp
 mlir/lib/IR/BuiltinDialect.cpp
+mlir/lib/IR/BuiltinTypeInterfaces.cpp
 mlir/lib/IR/BuiltinTypes.cpp
-mlir/lib/IR/Diagnostics.cpp
 mlir/lib/IR/Dialect.cpp
 mlir/lib/IR/Dominance.cpp
 mlir/lib/IR/FunctionImplementation.cpp
-mlir/lib/IR/FunctionSupport.cpp
 mlir/lib/IR/IntegerSet.cpp
 mlir/lib/IR/IntegerSetDetail.h
 mlir/lib/IR/Location.cpp
 mlir/lib/IR/MLIRContext.cpp
 mlir/lib/IR/Operation.cpp
-mlir/lib/IR/OperationSupport.cpp
 mlir/lib/IR/PatternMatch.cpp
 mlir/lib/IR/Region.cpp
 mlir/lib/IR/RegionKindInterface.cpp
@@ -7888,10 +8347,6 @@
 mlir/lib/IR/Verifier.cpp
 mlir/lib/IR/Visitors.cpp
 mlir/lib/Parser/AffineParser.cpp
-mlir/lib/Parser/AsmParserImpl.h
-mlir/lib/Parser/AsmParserState.cpp
-mlir/lib/Parser/AttributeParser.cpp
-mlir/lib/Parser/DialectSymbolParser.cpp
 mlir/lib/Parser/Lexer.cpp
 mlir/lib/Parser/Lexer.h
 mlir/lib/Parser/LocationParser.cpp
@@ -7946,7 +8401,6 @@
 mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
 mlir/lib/Target/LLVMIR/DebugTranslation.cpp
 mlir/lib/Target/LLVMIR/DebugTranslation.h
-mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
 mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
 mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
 mlir/lib/Target/LLVMIR/Dialect/AMX/AMXToLLVMIRTranslation.cpp
@@ -7968,7 +8422,6 @@
 mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp
 mlir/lib/Tools/mlir-lsp-server/LSPServer.h
 mlir/lib/Tools/mlir-lsp-server/MlirLspServerMain.cpp
-mlir/lib/Tools/mlir-lsp-server/MLIRServer.cpp
 mlir/lib/Tools/mlir-lsp-server/MLIRServer.h
 mlir/lib/Tools/mlir-lsp-server/lsp/Logging.cpp
 mlir/lib/Tools/mlir-lsp-server/lsp/Protocol.cpp
@@ -7980,33 +8433,31 @@
 mlir/lib/Tools/PDLL/AST/NodePrinter.cpp
 mlir/lib/Tools/PDLL/AST/TypeDetail.h
 mlir/lib/Tools/PDLL/AST/Types.cpp
+mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp
+mlir/lib/Tools/PDLL/ODS/Context.cpp
+mlir/lib/Tools/PDLL/ODS/Dialect.cpp
+mlir/lib/Tools/PDLL/ODS/Operation.cpp
 mlir/lib/Tools/PDLL/Parser/Parser.cpp
-mlir/lib/Transforms/BufferOptimizations.cpp
-mlir/lib/Transforms/BufferResultsToOutParams.cpp
-mlir/lib/Transforms/BufferUtils.cpp
 mlir/lib/Transforms/Canonicalizer.cpp
+mlir/lib/Transforms/ControlFlowSink.cpp
 mlir/lib/Transforms/CSE.cpp
 mlir/lib/Transforms/Inliner.cpp
 mlir/lib/Transforms/LocationSnapshot.cpp
-mlir/lib/Transforms/LoopCoalescing.cpp
-mlir/lib/Transforms/LoopFusion.cpp
-mlir/lib/Transforms/NormalizeMemRefs.cpp
-mlir/lib/Transforms/ParallelLoopCollapsing.cpp
+mlir/lib/Transforms/LoopInvariantCodeMotion.cpp
 mlir/lib/Transforms/PassDetail.h
-mlir/lib/Transforms/PipelineDataTransfer.cpp
 mlir/lib/Transforms/SCCP.cpp
 mlir/lib/Transforms/StripDebugInfo.cpp
 mlir/lib/Transforms/SymbolDCE.cpp
+mlir/lib/Transforms/SymbolPrivatize.cpp
+mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp
 mlir/lib/Transforms/Utils/DialectConversion.cpp
 mlir/lib/Transforms/Utils/FoldUtils.cpp
 mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
 mlir/lib/Transforms/Utils/InliningUtils.cpp
-mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
-mlir/lib/Transforms/Utils/LoopUtils.cpp
 mlir/lib/Transforms/Utils/RegionUtils.cpp
-mlir/lib/Transforms/Utils/Utils.cpp
 mlir/lib/Translation/Translation.cpp
 mlir/tools/mlir-cpu-runner/mlir-cpu-runner.cpp
+mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
 mlir/tools/mlir-lsp-server/mlir-lsp-server.cpp
 mlir/tools/mlir-opt/mlir-opt.cpp
 mlir/tools/mlir-pdll/mlir-pdll.cpp
@@ -8046,19 +8497,20 @@
 mlir/tools/mlir-vulkan-runner/vulkan-runtime-wrappers.cpp
 mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp
 mlir/tools/mlir-vulkan-runner/VulkanRuntime.h
-mlir/unittests/Analysis/AffineStructuresParser.cpp
-mlir/unittests/Analysis/AffineStructuresParser.h
-mlir/unittests/Analysis/AffineStructuresParserTest.cpp
-mlir/unittests/Analysis/AffineStructuresTest.cpp
-mlir/unittests/Analysis/PresburgerSetTest.cpp
 mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp
 mlir/unittests/Analysis/Presburger/LinearTransformTest.cpp
 mlir/unittests/Analysis/Presburger/MatrixTest.cpp
+mlir/unittests/Analysis/Presburger/PresburgerSetTest.cpp
+mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp
+mlir/unittests/Analysis/Presburger/PWMAFunctionTest.cpp
 mlir/unittests/Analysis/Presburger/SimplexTest.cpp
+mlir/unittests/Analysis/Presburger/Utils.h
 mlir/unittests/Conversion/PDLToPDLInterp/RootOrderingTest.cpp
 mlir/unittests/Dialect/BroadcastShapeTest.cpp
+mlir/unittests/Dialect/Affine/Analysis/AffineStructuresParser.cpp
+mlir/unittests/Dialect/Affine/Analysis/AffineStructuresParser.h
+mlir/unittests/Dialect/Affine/Analysis/AffineStructuresParserTest.cpp
 mlir/unittests/Dialect/Quant/QuantizationUtilsTest.cpp
-mlir/unittests/Dialect/SCF/SCFOps.cpp
 mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
 mlir/unittests/Dialect/SPIRV/DeserializationTest.cpp
 mlir/unittests/Dialect/SPIRV/SerializationTest.cpp
@@ -8076,6 +8528,7 @@
 mlir/unittests/IR/SubElementInterfaceTest.cpp
 mlir/unittests/Pass/AnalysisManagerTest.cpp
 mlir/unittests/Pass/PassManagerTest.cpp
+mlir/unittests/Pass/PassPipelineParserTest.cpp
 mlir/unittests/Rewrite/PatternBenefit.cpp
 mlir/unittests/Support/DebugCounterTest.cpp
 mlir/unittests/Support/IndentedOstreamTest.cpp
@@ -8105,7 +8558,6 @@
 openmp/libomptarget/DeviceRTL/include/Utils.h
 openmp/libomptarget/DeviceRTL/src/Configuration.cpp
 openmp/libomptarget/DeviceRTL/src/Kernel.cpp
-openmp/libomptarget/DeviceRTL/src/Mapping.cpp
 openmp/libomptarget/DeviceRTL/src/Misc.cpp
 openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
 openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -8113,25 +8565,10 @@
 openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
 openmp/libomptarget/DeviceRTL/src/Tasking.cpp
 openmp/libomptarget/DeviceRTL/src/Utils.cpp
-openmp/libomptarget/deviceRTLs/interface.h
-openmp/libomptarget/deviceRTLs/target_interface.h
-openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
-openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
-openmp/libomptarget/deviceRTLs/common/allocator.h
-openmp/libomptarget/deviceRTLs/common/debug.h
-openmp/libomptarget/deviceRTLs/common/omptarget.h
-openmp/libomptarget/deviceRTLs/common/omptargeti.h
-openmp/libomptarget/deviceRTLs/common/state-queue.h
-openmp/libomptarget/deviceRTLs/common/state-queuei.h
-openmp/libomptarget/deviceRTLs/common/include/target.h
-openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h
-openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp
-openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
-openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
 openmp/libomptarget/include/Debug.h
 openmp/libomptarget/include/device.h
 openmp/libomptarget/include/DeviceEnvironment.h
-openmp/libomptarget/include/dlwrap.h
+openmp/libomptarget/include/interop.h
 openmp/libomptarget/include/omptarget.h
 openmp/libomptarget/include/omptargetplugin.h
 openmp/libomptarget/include/rtl.h
@@ -8154,10 +8591,8 @@
 openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
 openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
 openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
-openmp/libomptarget/plugins/cuda/src/rtl.cpp
 openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
 openmp/libomptarget/plugins/remote/include/Utils.h
-openmp/libomptarget/plugins/remote/lib/Utils.cpp
 openmp/libomptarget/plugins/remote/server/OffloadingServer.cpp
 openmp/libomptarget/plugins/remote/server/Server.cpp
 openmp/libomptarget/plugins/remote/server/Server.h
@@ -8166,6 +8601,8 @@
 openmp/libomptarget/plugins/ve/src/rtl.cpp
 openmp/libomptarget/src/api.cpp
 openmp/libomptarget/src/interface.cpp
+openmp/libomptarget/src/interop.cpp
+openmp/libomptarget/src/omptarget.cpp
 openmp/libomptarget/src/private.h
 openmp/libomptarget/src/rtl.cpp
 openmp/libomptarget/tools/deviceinfo/llvm-omp-device-info.cpp
@@ -8202,7 +8639,6 @@
 openmp/runtime/src/kmp_itt.cpp
 openmp/runtime/src/kmp_itt.h
 openmp/runtime/src/kmp_lock.cpp
-openmp/runtime/src/kmp_lock.h
 openmp/runtime/src/kmp_omp.h
 openmp/runtime/src/kmp_platform.h
 openmp/runtime/src/kmp_safe_c_api.h
@@ -8373,9 +8809,12 @@
 polly/unittests/Support/ISLTools.cpp
 pstl/include/pstl/internal/algorithm_fwd.h
 pstl/include/pstl/internal/execution_defs.h
+pstl/include/pstl/internal/execution_impl.h
 pstl/include/pstl/internal/glue_algorithm_defs.h
+pstl/include/pstl/internal/glue_algorithm_impl.h
 pstl/include/pstl/internal/glue_execution_defs.h
 pstl/include/pstl/internal/glue_memory_defs.h
+pstl/include/pstl/internal/glue_memory_impl.h
 pstl/include/pstl/internal/glue_numeric_defs.h
 pstl/include/pstl/internal/glue_numeric_impl.h
 pstl/include/pstl/internal/numeric_fwd.h
@@ -8383,6 +8822,7 @@
 pstl/include/pstl/internal/parallel_backend_omp.h
 pstl/include/pstl/internal/parallel_backend_serial.h
 pstl/include/pstl/internal/parallel_backend_utils.h
+pstl/include/pstl/internal/parallel_impl.h
 pstl/include/pstl/internal/omp/parallel_for.h
 pstl/include/pstl/internal/omp/parallel_for_each.h
 pstl/include/pstl/internal/omp/parallel_invoke.h
diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h
--- a/clang/include/clang/Basic/Module.h
+++ b/clang/include/clang/Basic/Module.h
@@ -371,6 +371,10 @@
   /// The set of use declarations that have yet to be resolved.
   SmallVector<ModuleId, 2> UnresolvedDirectUses;
 
+  /// When \c NoUndeclaredIncludes is true, the set of modules this module tried
+  /// to import but didn't because they are not direct uses.
+  llvm::SmallSetVector<const Module *, 2> UndeclaredUses;
+
   /// A library or framework to link against when an entity from this
   /// module is used.
   struct LinkLibrary {
@@ -601,7 +605,7 @@
 
   /// Determine whether this module has declared its intention to
   /// directly use another module.
-  bool directlyUses(const Module *Requested) const;
+  bool directlyUses(const Module *Requested);
 
   /// Add the given feature requirement to the list of features
   /// required by this module.
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -2686,34 +2686,30 @@
   /// Such situations should use the specific attribute parsing functionality.
   void ParseAttributes(unsigned WhichAttrKinds,
                        ParsedAttributesWithRange &Attrs,
-                       SourceLocation *End = nullptr,
                        LateParsedAttrList *LateAttrs = nullptr);
   void ParseAttributes(unsigned WhichAttrKinds, ParsedAttributes &Attrs,
-                       SourceLocation *End = nullptr,
                        LateParsedAttrList *LateAttrs = nullptr) {
     ParsedAttributesWithRange AttrsWithRange(AttrFactory);
-    ParseAttributes(WhichAttrKinds, AttrsWithRange, End, LateAttrs);
+    ParseAttributes(WhichAttrKinds, AttrsWithRange, LateAttrs);
     Attrs.takeAllFrom(AttrsWithRange);
   }
   /// \brief Possibly parse attributes based on what syntaxes are desired,
   /// allowing for the order to vary.
   bool MaybeParseAttributes(unsigned WhichAttrKinds,
                             ParsedAttributesWithRange &Attrs,
-                            SourceLocation *End = nullptr,
                             LateParsedAttrList *LateAttrs = nullptr) {
     if (Tok.isOneOf(tok::kw___attribute, tok::kw___declspec) ||
         (standardAttributesAllowed() && isCXX11AttributeSpecifier())) {
-      ParseAttributes(WhichAttrKinds, Attrs, End, LateAttrs);
+      ParseAttributes(WhichAttrKinds, Attrs, LateAttrs);
       return true;
     }
     return false;
   }
   bool MaybeParseAttributes(unsigned WhichAttrKinds, ParsedAttributes &Attrs,
-                            SourceLocation *End = nullptr,
                             LateParsedAttrList *LateAttrs = nullptr) {
     if (Tok.isOneOf(tok::kw___attribute, tok::kw___declspec) ||
         (standardAttributesAllowed() && isCXX11AttributeSpecifier())) {
-      ParseAttributes(WhichAttrKinds, Attrs, End, LateAttrs);
+      ParseAttributes(WhichAttrKinds, Attrs, LateAttrs);
       return true;
     }
     return false;
@@ -2722,10 +2718,9 @@
   void MaybeParseGNUAttributes(Declarator &D,
                                LateParsedAttrList *LateAttrs = nullptr) {
     if (Tok.is(tok::kw___attribute)) {
-      ParsedAttributes attrs(AttrFactory);
-      SourceLocation endLoc;
-      ParseGNUAttributes(attrs, &endLoc, LateAttrs, &D);
-      D.takeAttributes(attrs, endLoc);
+      ParsedAttributesWithRange attrs(AttrFactory);
+      ParseGNUAttributes(attrs, LateAttrs, &D);
+      D.takeAttributes(attrs, attrs.Range.getEnd());
     }
   }
 
@@ -2735,11 +2730,10 @@
   /// This API is discouraged. Use the version that takes a
   /// ParsedAttributesWithRange instead.
   bool MaybeParseGNUAttributes(ParsedAttributes &Attrs,
-                               SourceLocation *EndLoc = nullptr,
                                LateParsedAttrList *LateAttrs = nullptr) {
     if (Tok.is(tok::kw___attribute)) {
       ParsedAttributesWithRange AttrsWithRange(AttrFactory);
-      ParseGNUAttributes(Attrs, EndLoc, LateAttrs);
+      ParseGNUAttributes(Attrs, LateAttrs);
       Attrs.takeAllFrom(AttrsWithRange);
       return true;
     }
@@ -2747,10 +2741,9 @@
   }
 
   bool MaybeParseGNUAttributes(ParsedAttributesWithRange &Attrs,
-                               SourceLocation *EndLoc = nullptr,
                                LateParsedAttrList *LateAttrs = nullptr) {
     if (Tok.is(tok::kw___attribute)) {
-      ParseGNUAttributes(Attrs, EndLoc, LateAttrs);
+      ParseGNUAttributes(Attrs, LateAttrs);
       return true;
     }
     return false;
@@ -2762,16 +2755,14 @@
   /// This API is discouraged. Use the version that takes a
   /// ParsedAttributesWithRange instead.
   void ParseGNUAttributes(ParsedAttributes &Attrs,
-                          SourceLocation *EndLoc = nullptr,
                           LateParsedAttrList *LateAttrs = nullptr,
                           Declarator *D = nullptr) {
     ParsedAttributesWithRange AttrsWithRange(AttrFactory);
-    ParseGNUAttributes(AttrsWithRange, EndLoc, LateAttrs, D);
+    ParseGNUAttributes(AttrsWithRange, LateAttrs, D);
     Attrs.takeAllFrom(AttrsWithRange);
   }
 
   void ParseGNUAttributes(ParsedAttributesWithRange &Attrs,
-                          SourceLocation *EndLoc = nullptr,
                           LateParsedAttrList *LateAttrs = nullptr,
                           Declarator *D = nullptr);
   void ParseGNUAttributeArgs(IdentifierInfo *AttrName,
@@ -2800,27 +2791,24 @@
   void MaybeParseCXX11Attributes(Declarator &D) {
     if (standardAttributesAllowed() && isCXX11AttributeSpecifier()) {
       ParsedAttributesWithRange attrs(AttrFactory);
-      SourceLocation endLoc;
-      ParseCXX11Attributes(attrs, &endLoc);
-      D.takeAttributes(attrs, endLoc);
+      ParseCXX11Attributes(attrs);
+      D.takeAttributes(attrs, attrs.Range.getEnd());
     }
   }
-  bool MaybeParseCXX11Attributes(ParsedAttributes &attrs,
-                                 SourceLocation *endLoc = nullptr) {
+  bool MaybeParseCXX11Attributes(ParsedAttributes &attrs) {
     if (standardAttributesAllowed() && isCXX11AttributeSpecifier()) {
       ParsedAttributesWithRange attrsWithRange(AttrFactory);
-      ParseCXX11Attributes(attrsWithRange, endLoc);
+      ParseCXX11Attributes(attrsWithRange);
       attrs.takeAllFrom(attrsWithRange);
       return true;
     }
     return false;
   }
   bool MaybeParseCXX11Attributes(ParsedAttributesWithRange &attrs,
-                                 SourceLocation *endLoc = nullptr,
                                  bool OuterMightBeMessageSend = false) {
     if (standardAttributesAllowed() &&
         isCXX11AttributeSpecifier(false, OuterMightBeMessageSend)) {
-      ParseCXX11Attributes(attrs, endLoc);
+      ParseCXX11Attributes(attrs);
       return true;
     }
     return false;
@@ -2838,8 +2826,7 @@
     ParseCXX11AttributeSpecifierInternal(Attrs, OpenMPTokens, EndLoc);
     ReplayOpenMPAttributeTokens(OpenMPTokens);
   }
-  void ParseCXX11Attributes(ParsedAttributesWithRange &attrs,
-                            SourceLocation *EndLoc = nullptr);
+  void ParseCXX11Attributes(ParsedAttributesWithRange &attrs);
   /// Parses a C++11 (or C2x)-style attribute argument list. Returns true
   /// if this results in adding an attribute to the ParsedAttributes list.
   bool ParseCXX11AttributeArgs(IdentifierInfo *AttrName,
@@ -2854,25 +2841,23 @@
       Sema::AttributeCompletion Completion = Sema::AttributeCompletion::None,
       const IdentifierInfo *EnclosingScope = nullptr);
 
-  void MaybeParseMicrosoftAttributes(ParsedAttributes &attrs,
-                                     SourceLocation *endLoc = nullptr) {
-    if (getLangOpts().MicrosoftExt && Tok.is(tok::l_square))
-      ParseMicrosoftAttributes(attrs, endLoc);
+  void MaybeParseMicrosoftAttributes(ParsedAttributes &Attrs) {
+    if (getLangOpts().MicrosoftExt && Tok.is(tok::l_square)) {
+      ParsedAttributesWithRange AttrsWithRange(AttrFactory);
+      ParseMicrosoftAttributes(AttrsWithRange);
+      Attrs.takeAllFrom(AttrsWithRange);
+    }
   }
   void ParseMicrosoftUuidAttributeArgs(ParsedAttributes &Attrs);
-  void ParseMicrosoftAttributes(ParsedAttributes &attrs,
-                                SourceLocation *endLoc = nullptr);
-  bool MaybeParseMicrosoftDeclSpecs(ParsedAttributes &Attrs,
-                                    SourceLocation *End = nullptr) {
-    const auto &LO = getLangOpts();
-    if (LO.DeclSpecKeyword && Tok.is(tok::kw___declspec)) {
-      ParseMicrosoftDeclSpecs(Attrs, End);
+  void ParseMicrosoftAttributes(ParsedAttributesWithRange &attrs);
+  bool MaybeParseMicrosoftDeclSpecs(ParsedAttributesWithRange &Attrs) {
+    if (getLangOpts().DeclSpecKeyword && Tok.is(tok::kw___declspec)) {
+      ParseMicrosoftDeclSpecs(Attrs);
       return true;
     }
     return false;
   }
-  void ParseMicrosoftDeclSpecs(ParsedAttributes &Attrs,
-                               SourceLocation *End = nullptr);
+  void ParseMicrosoftDeclSpecs(ParsedAttributesWithRange &Attrs);
   bool ParseMicrosoftDeclSpecArgs(IdentifierInfo *AttrName,
                                   SourceLocation AttrNameLoc,
                                   ParsedAttributes &Attrs);
diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h
--- a/clang/include/clang/Sema/DeclSpec.h
+++ b/clang/include/clang/Sema/DeclSpec.h
@@ -368,7 +368,7 @@
   ExplicitSpecifier FS_explicit_specifier;
 
   // attributes.
-  ParsedAttributes Attrs;
+  ParsedAttributesWithRange Attrs;
 
   // Scope specifier for the type spec, if applicable.
   CXXScopeSpec TypeScope;
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h b/clang/include/clang/Tooling/Syntax/Pseudo/DirectiveMap.h
rename from clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
rename to clang/include/clang/Tooling/Syntax/Pseudo/DirectiveMap.h
--- a/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/DirectiveMap.h
@@ -1,4 +1,4 @@
-//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===//
+//===--- DirectiveMap.h - Find and strip preprocessor directives -*- C++-*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -56,7 +56,7 @@
 ///
 /// Unlike the clang preprocessor, we model the full tree explicitly.
 /// This class does not recognize macro usage, only directives.
-struct PPStructure {
+struct DirectiveMap {
   /// A range of code (and possibly comments) containing no directives.
   struct Code {
     Token::Range Tokens;
@@ -76,7 +76,7 @@
     ///
     /// The first branch will have an #if type directive.
     /// Subsequent branches will have #else type directives.
-    std::vector<std::pair<Directive, PPStructure>> Branches;
+    std::vector<std::pair<Directive, DirectiveMap>> Branches;
     /// The directive terminating the conditional, should be #endif.
     Directive End;
   };
@@ -86,22 +86,22 @@
   std::vector<Chunk> Chunks;
 
   /// Extract preprocessor structure by examining the raw tokens.
-  static PPStructure parse(const TokenStream &);
+  static DirectiveMap parse(const TokenStream &);
 
   // FIXME: add heuristically selection of conditional branches.
   // FIXME: allow deriving a preprocessed stream
 };
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &);
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Chunk &);
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Code &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap::Chunk &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const DirectiveMap::Code &);
 llvm::raw_ostream &operator<<(llvm::raw_ostream &,
-                              const PPStructure::Directive &);
+                              const DirectiveMap::Directive &);
 llvm::raw_ostream &operator<<(llvm::raw_ostream &,
-                              const PPStructure::Conditional &);
+                              const DirectiveMap::Conditional &);
 
 // FIXME: This approximates std::variant<Code, Directive, Conditional>.
 //         Switch once we can use C++17.
-class PPStructure::Chunk {
+class DirectiveMap::Chunk {
 public:
   enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
   Kind kind() const {
diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp
--- a/clang/lib/Basic/Module.cpp
+++ b/clang/lib/Basic/Module.cpp
@@ -267,7 +267,7 @@
   return llvm::makeArrayRef(TopHeaders.begin(), TopHeaders.end());
 }
 
-bool Module::directlyUses(const Module *Requested) const {
+bool Module::directlyUses(const Module *Requested) {
   auto *Top = getTopLevelModule();
 
   // A top-level module implicitly uses itself.
@@ -282,6 +282,9 @@
   if (!Requested->Parent && Requested->Name == "_Builtin_stddef_max_align_t")
     return true;
 
+  if (NoUndeclaredIncludes)
+    UndeclaredUses.insert(Requested);
+
   return false;
 }
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -15778,6 +15778,8 @@
     return Builder.CreateTrunc(LoadIntrinsic, Int16Ty);
   }
   // FMA variations
+  case PPC::BI__builtin_ppc_fnmsub:
+  case PPC::BI__builtin_ppc_fnmsubs:
   case PPC::BI__builtin_vsx_xvmaddadp:
   case PPC::BI__builtin_vsx_xvmaddasp:
   case PPC::BI__builtin_vsx_xvnmaddadp:
@@ -15816,6 +15818,8 @@
               F, {X, Y, Builder.CreateFNeg(Z, "neg")});
         else
           return Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
+      case PPC::BI__builtin_ppc_fnmsub:
+      case PPC::BI__builtin_ppc_fnmsubs:
       case PPC::BI__builtin_vsx_xvnmsubadp:
       case PPC::BI__builtin_vsx_xvnmsubasp:
         if (Builder.getIsFPConstrained())
@@ -15824,10 +15828,9 @@
                   F, {X, Y, Builder.CreateFNeg(Z, "neg")}),
               "neg");
         else
-          return Builder.CreateFNeg(
-              Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")}),
-              "neg");
-    }
+          return Builder.CreateCall(
+              CGM.getIntrinsic(Intrinsic::ppc_fnmsub, ResultType), {X, Y, Z});
+      }
     llvm_unreachable("Unknown FMA operation");
     return nullptr; // Suppress no-return warning
   }
diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -310,6 +310,8 @@
           for (; J != AnnotatedLines.begin(); --J)
             if ((*J)->Level < TheLine->Level)
               break;
+          if ((*J)->Level >= TheLine->Level)
+            return false;
 
           // Check if the found line starts a record.
           const FormatToken *LastNonComment = (*J)->Last;
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -105,7 +105,6 @@
 
 void Parser::ParseAttributes(unsigned WhichAttrKinds,
                              ParsedAttributesWithRange &Attrs,
-                             SourceLocation *End,
                              LateParsedAttrList *LateAttrs) {
   bool MoreToParse;
   do {
@@ -113,11 +112,11 @@
     // parsed, loop to ensure all specified attribute combinations are parsed.
     MoreToParse = false;
     if (WhichAttrKinds & PAKM_CXX11)
-      MoreToParse |= MaybeParseCXX11Attributes(Attrs, End);
+      MoreToParse |= MaybeParseCXX11Attributes(Attrs);
     if (WhichAttrKinds & PAKM_GNU)
-      MoreToParse |= MaybeParseGNUAttributes(Attrs, End, LateAttrs);
+      MoreToParse |= MaybeParseGNUAttributes(Attrs, LateAttrs);
     if (WhichAttrKinds & PAKM_Declspec)
-      MoreToParse |= MaybeParseMicrosoftDeclSpecs(Attrs, End);
+      MoreToParse |= MaybeParseMicrosoftDeclSpecs(Attrs);
   } while (MoreToParse);
 }
 
@@ -163,14 +162,11 @@
 ///
 /// We follow the C++ model, but don't allow junk after the identifier.
 void Parser::ParseGNUAttributes(ParsedAttributesWithRange &Attrs,
-                                SourceLocation *EndLoc,
                                 LateParsedAttrList *LateAttrs, Declarator *D) {
   assert(Tok.is(tok::kw___attribute) && "Not a GNU attribute list!");
 
-  SourceLocation StartLoc = Tok.getLocation(), Loc;
-
-  if (!EndLoc)
-    EndLoc = &Loc;
+  SourceLocation StartLoc = Tok.getLocation();
+  SourceLocation EndLoc = StartLoc;
 
   while (Tok.is(tok::kw___attribute)) {
     SourceLocation AttrTokLoc = ConsumeToken();
@@ -214,7 +210,7 @@
 
       // Handle "parameterized" attributes
       if (!LateAttrs || !isAttributeLateParsed(*AttrName)) {
-        ParseGNUAttributeArgs(AttrName, AttrNameLoc, Attrs, EndLoc, nullptr,
+        ParseGNUAttributeArgs(AttrName, AttrNameLoc, Attrs, &EndLoc, nullptr,
                               SourceLocation(), ParsedAttr::AS_GNU, D);
         continue;
       }
@@ -247,8 +243,7 @@
     SourceLocation Loc = Tok.getLocation();
     if (ExpectAndConsume(tok::r_paren))
       SkipUntil(tok::r_paren, StopAtSemi);
-    if (EndLoc)
-      *EndLoc = Loc;
+    EndLoc = Loc;
 
     // If this was declared in a macro, attach the macro IdentifierInfo to the
     // parsed attribute.
@@ -270,7 +265,7 @@
     }
   }
 
-  Attrs.Range = SourceRange(StartLoc, *EndLoc);
+  Attrs.Range = SourceRange(StartLoc, EndLoc);
 }
 
 /// Determine whether the given attribute has an identifier argument.
@@ -750,11 +745,13 @@
 /// [MS] extended-decl-modifier-seq:
 ///             extended-decl-modifier[opt]
 ///             extended-decl-modifier extended-decl-modifier-seq
-void Parser::ParseMicrosoftDeclSpecs(ParsedAttributes &Attrs,
-                                     SourceLocation *End) {
+void Parser::ParseMicrosoftDeclSpecs(ParsedAttributesWithRange &Attrs) {
   assert(getLangOpts().DeclSpecKeyword && "__declspec keyword is not enabled");
   assert(Tok.is(tok::kw___declspec) && "Not a declspec!");
 
+  SourceLocation StartLoc = Tok.getLocation();
+  SourceLocation EndLoc = StartLoc;
+
   while (Tok.is(tok::kw___declspec)) {
     ConsumeToken();
     BalancedDelimiterTracker T(*this, tok::l_paren);
@@ -817,9 +814,10 @@
                      ParsedAttr::AS_Declspec);
     }
     T.consumeClose();
-    if (End)
-      *End = T.getCloseLocation();
+    EndLoc = T.getCloseLocation();
   }
+
+  Attrs.Range = SourceRange(StartLoc, EndLoc);
 }
 
 void Parser::ParseMicrosoftTypeAttributes(ParsedAttributes &attrs) {
@@ -3663,8 +3661,7 @@
     // Attributes support.
     case tok::kw___attribute:
     case tok::kw___declspec:
-      ParseAttributes(PAKM_GNU | PAKM_Declspec, DS.getAttributes(), nullptr,
-                      LateAttrs);
+      ParseAttributes(PAKM_GNU | PAKM_Declspec, DS.getAttributes(), LateAttrs);
       continue;
 
     // Microsoft single token adornments.
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -4513,19 +4513,17 @@
 ///
 /// attribute-specifier-seq:
 ///       attribute-specifier-seq[opt] attribute-specifier
-void Parser::ParseCXX11Attributes(ParsedAttributesWithRange &attrs,
-                                  SourceLocation *endLoc) {
+void Parser::ParseCXX11Attributes(ParsedAttributesWithRange &attrs) {
   assert(standardAttributesAllowed());
 
-  SourceLocation StartLoc = Tok.getLocation(), Loc;
-  if (!endLoc)
-    endLoc = &Loc;
+  SourceLocation StartLoc = Tok.getLocation();
+  SourceLocation EndLoc = StartLoc;
 
   do {
-    ParseCXX11AttributeSpecifier(attrs, endLoc);
+    ParseCXX11AttributeSpecifier(attrs, &EndLoc);
   } while (isCXX11AttributeSpecifier());
 
-  attrs.Range = SourceRange(StartLoc, *endLoc);
+  attrs.Range = SourceRange(StartLoc, EndLoc);
 }
 
 void Parser::DiagnoseAndSkipCXX11Attributes() {
@@ -4658,10 +4656,11 @@
 /// [MS] ms-attribute-seq:
 ///             ms-attribute[opt]
 ///             ms-attribute ms-attribute-seq
-void Parser::ParseMicrosoftAttributes(ParsedAttributes &attrs,
-                                      SourceLocation *endLoc) {
+void Parser::ParseMicrosoftAttributes(ParsedAttributesWithRange &Attrs) {
   assert(Tok.is(tok::l_square) && "Not a Microsoft attribute list");
 
+  SourceLocation StartLoc = Tok.getLocation();
+  SourceLocation EndLoc = StartLoc;
   do {
     // FIXME: If this is actually a C++11 attribute, parse it as one.
     BalancedDelimiterTracker T(*this, tok::l_square);
@@ -4681,15 +4680,16 @@
       if (Tok.isNot(tok::identifier)) // ']', but also eof
         break;
       if (Tok.getIdentifierInfo()->getName() == "uuid")
-        ParseMicrosoftUuidAttributeArgs(attrs);
+        ParseMicrosoftUuidAttributeArgs(Attrs);
       else
         ConsumeToken();
     }
 
     T.consumeClose();
-    if (endLoc)
-      *endLoc = T.getCloseLocation();
+    EndLoc = T.getCloseLocation();
   } while (Tok.is(tok::l_square));
+
+  Attrs.Range = SourceRange(StartLoc, EndLoc);
 }
 
 void Parser::ParseMicrosoftIfExistsClassDeclaration(
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -1252,7 +1252,7 @@
   TemplateParameterDepthRAII CurTemplateDepthTracker(TemplateParameterDepth);
   Actions.PushLambdaScope();
 
-  ParsedAttributes Attr(AttrFactory);
+  ParsedAttributesWithRange Attr(AttrFactory);
   if (getLangOpts().CUDA) {
     // In CUDA code, GNU attributes are allowed to appear immediately after the
     // "[...]", even if there is no "(...)" before the lambda body.
@@ -1355,7 +1355,8 @@
           DeclEndLoc = ESpecRange.getEnd();
 
         // Parse attribute-specifier[opt].
-        MaybeParseCXX11Attributes(Attr, &DeclEndLoc);
+        if (MaybeParseCXX11Attributes(Attr))
+          DeclEndLoc = Attr.Range.getEnd();
 
         // Parse OpenCL addr space attribute.
         if (Tok.isOneOf(tok::kw___private, tok::kw___global, tok::kw___local,
diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp
--- a/clang/lib/Parse/ParsePragma.cpp
+++ b/clang/lib/Parse/ParsePragma.cpp
@@ -341,7 +341,7 @@
                     Token &FirstToken) override;
 
   /// A pool of attributes that were parsed in \#pragma clang attribute.
-  ParsedAttributes AttributesForPragmaAttribute;
+  ParsedAttributesWithRange AttributesForPragmaAttribute;
 };
 
 struct PragmaMaxTokensHereHandler : public PragmaHandler {
@@ -1365,12 +1365,13 @@
 namespace {
 struct PragmaAttributeInfo {
   enum ActionType { Push, Pop, Attribute };
-  ParsedAttributes &Attributes;
+  ParsedAttributesWithRange &Attributes;
   ActionType Action;
   const IdentifierInfo *Namespace = nullptr;
   ArrayRef<Token> Tokens;
 
-  PragmaAttributeInfo(ParsedAttributes &Attributes) : Attributes(Attributes) {}
+  PragmaAttributeInfo(ParsedAttributesWithRange &Attributes)
+      : Attributes(Attributes) {}
 };
 
 #include "clang/Parse/AttrSubMatchRulesParserStringSwitches.inc"
@@ -1640,7 +1641,7 @@
                       /*IsReinject=*/false);
   ConsumeAnnotationToken();
 
-  ParsedAttributes &Attrs = Info->Attributes;
+  ParsedAttributesWithRange &Attrs = Info->Attributes;
   Attrs.clearListOnly();
 
   auto SkipToEnd = [this]() {
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -106,7 +106,7 @@
   // at the start of the statement. Thus, we're not using MaybeParseAttributes
   // here because we don't want to allow arbitrary orderings.
   ParsedAttributesWithRange Attrs(AttrFactory);
-  MaybeParseCXX11Attributes(Attrs, nullptr, /*MightBeObjCMessageSend*/ true);
+  MaybeParseCXX11Attributes(Attrs, /*MightBeObjCMessageSend*/ true);
   if (getLangOpts().OpenCL)
     MaybeParseGNUAttributes(Attrs);
 
@@ -1119,8 +1119,7 @@
         ConsumeToken();
 
       ParsedAttributesWithRange attrs(AttrFactory);
-      MaybeParseCXX11Attributes(attrs, nullptr,
-                                /*MightBeObjCMessageSend*/ true);
+      MaybeParseCXX11Attributes(attrs, /*MightBeObjCMessageSend*/ true);
 
       // If this is the start of a declaration, parse it as such.
       if (isDeclarationStatement()) {
diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp
--- a/clang/lib/Parse/ParseTentative.cpp
+++ b/clang/lib/Parse/ParseTentative.cpp
@@ -1913,7 +1913,7 @@
                                   /*OuterMightBeMessageSend*/true))
       return TPResult::True;
 
-    ParsedAttributes attrs(AttrFactory);
+    ParsedAttributesWithRange attrs(AttrFactory);
     MaybeParseMicrosoftAttributes(attrs);
 
     // decl-specifier-seq
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -164,8 +164,8 @@
 std::set<const FileEntry *> GetAllModuleMaps(const HeaderSearch &HS,
                                              Module *RootModule) {
   std::set<const FileEntry *> ModuleMaps{};
-  std::set<Module *> ProcessedModules;
-  SmallVector<Module *> ModulesToProcess{RootModule};
+  std::set<const Module *> ProcessedModules;
+  SmallVector<const Module *> ModulesToProcess{RootModule};
 
   SmallVector<const FileEntry *, 16> FilesByUID;
   HS.getFileMgr().GetUniqueIDMapping(FilesByUID);
@@ -209,6 +209,11 @@
       }
       ModulesToProcess.push_back(ImportedModule);
     }
+
+    for (const Module *UndeclaredModule : CurrentModule->UndeclaredUses)
+      if (UndeclaredModule &&
+          ProcessedModules.find(UndeclaredModule) == ProcessedModules.end())
+        ModulesToProcess.push_back(UndeclaredModule);
   }
 
   return ModuleMaps;
@@ -2861,6 +2866,8 @@
     // Might be unnecessary as use declarations are only used to build the
     // module itself.
 
+    // TODO: Consider serializing undeclared uses of modules.
+
     // Emit the link libraries.
     for (const auto &LL : Mod->LinkLibraries) {
       RecordData::value_type Record[] = {SUBMODULE_LINK_LIBRARY,
diff --git a/clang/lib/StaticAnalyzer/Checkers/DeadStoresChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DeadStoresChecker.cpp
--- a/clang/lib/StaticAnalyzer/Checkers/DeadStoresChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DeadStoresChecker.cpp
@@ -107,11 +107,8 @@
       dyn_cast<BinaryOperator>(Ex->IgnoreParenCasts());
     if (!BO)
       break;
-    if (BO->getOpcode() == BO_Assign) {
-      Ex = BO->getRHS();
-      continue;
-    }
-    if (BO->getOpcode() == BO_Comma) {
+    BinaryOperatorKind Op = BO->getOpcode();
+    if (Op == BO_Assign || Op == BO_Comma) {
       Ex = BO->getRHS();
       continue;
     }
diff --git a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
--- a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -580,7 +580,9 @@
       {{"fgetln"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{"fgets"}, TR::Prop({{2}}, {{0, ReturnValueIndex}})},
       {{"fscanf"}, TR::Prop({{0}}, {{}, 2})},
+      {{"fscanf_s"}, TR::Prop({{0}}, {{}, {2}})},
       {{"sscanf"}, TR::Prop({{0}}, {{}, 2})},
+
       {{"getc"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{"getc_unlocked"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{"getdelim"}, TR::Prop({{3}}, {{0}})},
@@ -592,6 +594,78 @@
       {{"strrchr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{"tolower"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{"toupper"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"fread"}, TR::Prop({{3}}, {{0, ReturnValueIndex}})},
+      {{"recv"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
+      {{"recvfrom"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
+
+      {{"ttyname"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"ttyname_r"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
+
+      {{"basename"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"dirname"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"fnmatch"}, TR::Prop({{1}}, {{ReturnValueIndex}})},
+      {{"memchr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"memrchr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"rawmemchr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+
+      {{"mbtowc"}, TR::Prop({{1}}, {{0, ReturnValueIndex}})},
+      {{"wctomb"}, TR::Prop({{1}}, {{0, ReturnValueIndex}})},
+      {{"wcwidth"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+
+      {{"memcmp"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
+      {{"memcpy"}, TR::Prop({{1}}, {{0, ReturnValueIndex}})},
+      {{"memmove"}, TR::Prop({{1}}, {{0, ReturnValueIndex}})},
+      // If memmem was called with a tainted needle and the search was
+      // successful, that would mean that the value pointed to by the return
+      // value has the same content as the needle. If we choose to go by the
+      // policy of content equivalence implies taintedness equivalence, that
+      // would mean haystack should be considered a propagation source argument.
+      {{"memmem"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+
+      // The comment for memmem above also applies to strstr.
+      {{"strstr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"strcasestr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+
+      {{"strchrnul"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+
+      {{"index"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"rindex"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+
+      // FIXME: In case of arrays, only the first element of the array gets
+      // tainted.
+      {{"qsort"}, TR::Prop({{0}}, {{0}})},
+      {{"qsort_r"}, TR::Prop({{0}}, {{0}})},
+
+      {{"strcmp"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
+      {{"strcasecmp"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
+      {{"strncmp"}, TR::Prop({{0, 1, 2}}, {{ReturnValueIndex}})},
+      {{"strncasecmp"}, TR::Prop({{0, 1, 2}}, {{ReturnValueIndex}})},
+      {{"strspn"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
+      {{"strcspn"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
+      {{"strpbrk"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"strndup"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"strndupa"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"strlen"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"strnlen"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"strtol"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
+      {{"strtoll"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
+      {{"strtoul"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
+      {{"strtoull"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
+
+      {{"isalnum"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"isalpha"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"isascii"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"isblank"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"iscntrl"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"isdigit"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"isgraph"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"islower"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"isprint"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"ispunct"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"isspace"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"isupper"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{"isxdigit"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+
       {{CDF_MaybeBuiltin, {BI.getName(Builtin::BIstrncat)}},
        TR::Prop({{1, 2}}, {{0, ReturnValueIndex}})},
       {{CDF_MaybeBuiltin, {BI.getName(Builtin::BIstrlcpy)}},
@@ -927,7 +1001,6 @@
 }
 
 /// Checker registration
-
 void ento::registerGenericTaintChecker(CheckerManager &Mgr) {
   Mgr.registerChecker<GenericTaintChecker>();
 }
diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
--- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -1,13 +1,13 @@
 set(LLVM_LINK_COMPONENTS Support)
 
 add_clang_library(clangToolingSyntaxPseudo
+  DirectiveMap.cpp
   Grammar.cpp
   GrammarBNF.cpp
   Lex.cpp
   LRGraph.cpp
   LRTable.cpp
   LRTableBuild.cpp
-  Preprocess.cpp
   Token.cpp
 
   LINK_LIBS
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp b/clang/lib/Tooling/Syntax/Pseudo/DirectiveMap.cpp
rename from clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
rename to clang/lib/Tooling/Syntax/Pseudo/DirectiveMap.cpp
--- a/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
+++ b/clang/lib/Tooling/Syntax/Pseudo/DirectiveMap.cpp
@@ -1,4 +1,4 @@
-//===--- Preprocess.cpp - Preprocess token streams ------------------------===//
+//===--- DirectiveMap.cpp - Find and strip preprocessor directives --------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Tooling/Syntax/Pseudo/DirectiveMap.h"
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/TokenKinds.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -16,10 +16,11 @@
 namespace pseudo {
 namespace {
 
-class PPParser {
+class DirectiveParser {
 public:
-  explicit PPParser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {}
-  void parse(PPStructure *Result) { parse(Result, /*TopLevel=*/true); }
+  explicit DirectiveParser(const TokenStream &Code)
+      : Code(Code), Tok(&Code.front()) {}
+  void parse(DirectiveMap *Result) { parse(Result, /*TopLevel=*/true); }
 
 private:
   // Roles that a directive might take within a conditional block.
@@ -42,10 +43,11 @@
     }
   }
 
-  // Parses tokens starting at Tok into PP.
-  // If we reach an End or Else directive that ends PP, returns it.
+  // Parses tokens starting at Tok into Map.
+  // If we reach an End or Else directive that ends Map, returns it.
   // If TopLevel is true, then we do not expect End and always return None.
-  llvm::Optional<PPStructure::Directive> parse(PPStructure *PP, bool TopLevel) {
+  llvm::Optional<DirectiveMap::Directive> parse(DirectiveMap *Map,
+                                                bool TopLevel) {
     auto StartsDirective =
         [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
           if (Tok->flag(LexFlags::StartsPPLine)) {
@@ -65,29 +67,29 @@
         do
           ++Tok;
         while (Tok->Kind != tok::eof && !StartsDirective());
-        PP->Chunks.push_back(PPStructure::Code{
+        Map->Chunks.push_back(DirectiveMap::Code{
             Token::Range{Code.index(*Start), Code.index(*Tok)}});
         continue;
       }
 
       // We have some kind of directive.
-      PPStructure::Directive Directive;
+      DirectiveMap::Directive Directive;
       parseDirective(&Directive);
       Cond Kind = classifyDirective(Directive.Kind);
       if (Kind == Cond::If) {
         // #if or similar, starting a nested conditional block.
-        PPStructure::Conditional Conditional;
+        DirectiveMap::Conditional Conditional;
         Conditional.Branches.emplace_back();
         Conditional.Branches.back().first = std::move(Directive);
         parseConditional(&Conditional);
-        PP->Chunks.push_back(std::move(Conditional));
+        Map->Chunks.push_back(std::move(Conditional));
       } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
-        // #endif or similar, ending this PPStructure scope.
+        // #endif or similar, ending this DirectiveMap scope.
         // (#endif is unexpected at the top level, treat as simple directive).
         return std::move(Directive);
       } else {
         // #define or similar, a simple directive at the current scope.
-        PP->Chunks.push_back(std::move(Directive));
+        Map->Chunks.push_back(std::move(Directive));
       }
     }
     return None;
@@ -95,7 +97,7 @@
 
   // Parse the rest of a conditional section, after seeing the If directive.
   // Returns after consuming the End directive.
-  void parseConditional(PPStructure::Conditional *C) {
+  void parseConditional(DirectiveMap::Conditional *C) {
     assert(C->Branches.size() == 1 &&
            C->Branches.front().second.Chunks.empty() &&
            "Should be ready to parse first branch body");
@@ -118,7 +120,7 @@
   }
 
   // Parse a directive. Tok is the hash.
-  void parseDirective(PPStructure::Directive *D) {
+  void parseDirective(DirectiveMap::Directive *D) {
     assert(Tok->Kind == tok::hash);
 
     // Directive spans from the hash until the end of line or file.
@@ -142,25 +144,26 @@
 
 } // namespace
 
-PPStructure PPStructure::parse(const TokenStream &Code) {
-  PPStructure Result;
-  PPParser(Code).parse(&Result);
+DirectiveMap DirectiveMap::parse(const TokenStream &Code) {
+  DirectiveMap Result;
+  DirectiveParser(Code).parse(&Result);
   return Result;
 }
 
-static void dump(llvm::raw_ostream &OS, const PPStructure &, unsigned Indent);
-static void dump(llvm::raw_ostream &OS, const PPStructure::Directive &Directive,
-                 unsigned Indent) {
+static void dump(llvm::raw_ostream &OS, const DirectiveMap &, unsigned Indent);
+static void dump(llvm::raw_ostream &OS,
+                 const DirectiveMap::Directive &Directive, unsigned Indent) {
   OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
                                      tok::getPPKeywordSpelling(Directive.Kind),
                                      Directive.Tokens.size());
 }
-static void dump(llvm::raw_ostream &OS, const PPStructure::Code &Code,
+static void dump(llvm::raw_ostream &OS, const DirectiveMap::Code &Code,
                  unsigned Indent) {
   OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", Code.Tokens.size());
 }
 static void dump(llvm::raw_ostream &OS,
-                 const PPStructure::Conditional &Conditional, unsigned Indent) {
+                 const DirectiveMap::Conditional &Conditional,
+                 unsigned Indent) {
   for (const auto &Branch : Conditional.Branches) {
     dump(OS, Branch.first, Indent);
     dump(OS, Branch.second, Indent + 2);
@@ -168,23 +171,23 @@
   dump(OS, Conditional.End, Indent);
 }
 
-static void dump(llvm::raw_ostream &OS, const PPStructure::Chunk &Chunk,
+static void dump(llvm::raw_ostream &OS, const DirectiveMap::Chunk &Chunk,
                  unsigned Indent) {
   switch (Chunk.kind()) {
-  case PPStructure::Chunk::K_Empty:
+  case DirectiveMap::Chunk::K_Empty:
     llvm_unreachable("invalid chunk");
-  case PPStructure::Chunk::K_Code:
-    return dump(OS, (const PPStructure::Code &)Chunk, Indent);
-  case PPStructure::Chunk::K_Directive:
-    return dump(OS, (const PPStructure::Directive &)Chunk, Indent);
-  case PPStructure::Chunk::K_Conditional:
-    return dump(OS, (const PPStructure::Conditional &)Chunk, Indent);
+  case DirectiveMap::Chunk::K_Code:
+    return dump(OS, (const DirectiveMap::Code &)Chunk, Indent);
+  case DirectiveMap::Chunk::K_Directive:
+    return dump(OS, (const DirectiveMap::Directive &)Chunk, Indent);
+  case DirectiveMap::Chunk::K_Conditional:
+    return dump(OS, (const DirectiveMap::Conditional &)Chunk, Indent);
   }
 }
 
-static void dump(llvm::raw_ostream &OS, const PPStructure &PP,
+static void dump(llvm::raw_ostream &OS, const DirectiveMap &Map,
                  unsigned Indent) {
-  for (const auto &Chunk : PP.Chunks)
+  for (const auto &Chunk : Map.Chunks)
     dump(OS, Chunk, Indent);
 }
 
@@ -194,11 +197,11 @@
     dump(OS, T, 0);                                                            \
     return OS;                                                                 \
   }
-OSTREAM_DUMP(PPStructure)
-OSTREAM_DUMP(PPStructure::Chunk)
-OSTREAM_DUMP(PPStructure::Directive)
-OSTREAM_DUMP(PPStructure::Conditional)
-OSTREAM_DUMP(PPStructure::Code)
+OSTREAM_DUMP(DirectiveMap)
+OSTREAM_DUMP(DirectiveMap::Chunk)
+OSTREAM_DUMP(DirectiveMap::Directive)
+OSTREAM_DUMP(DirectiveMap::Conditional)
+OSTREAM_DUMP(DirectiveMap::Code)
 #undef OSTREAM_DUMP
 
 } // namespace pseudo
diff --git a/clang/lib/Tooling/Syntax/Pseudo/README.md b/clang/lib/Tooling/Syntax/Pseudo/README.md
new file mode 100644
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/README.md
@@ -0,0 +1,37 @@
+# clang pseudoparser
+
+This directory implements an approximate heuristic parser for C++, based on the
+clang lexer, the C++ grammar, and the GLR parsing algorithm.
+
+It parses a file in isolation, without reading its included headers.
+The result is a strict syntactic tree whose structure follows the C++ grammar.
+There is no semantic analysis, apart from guesses to disambiguate the parse.
+Disambiguation can optionally be guided by an AST or a symbol index.
+
+For now, the best reference on intended scope is the [design proposal],
+with further discussion on the [RFC].
+
+## Dependencies between pseudoparser and clang
+
+Dependencies are limited because they don't make sense, but also to avoid
+placing a burden on clang maintainers.
+
+The pseudoparser reuses the clang lexer (clangLex and clangBasic libraries) but
+not the higher-level libraries (Parse, Sema, AST, Frontend...).
+
+When the pseudoparser should be used together with an AST (e.g. to guide
+disambiguation), this is a separate "bridge" library that depends on both.
+
+Clang does not depend on the pseudoparser at all. If this seems useful in future
+it should be discussed by RFC.
+
+## Parity between pseudoparser and clang
+
+The pseudoparser aims to understand real-world code, and particularly the
+languages and extensions supported by Clang.
+
+However we don't try to keep these in lockstep: there's no expectation that
+Clang parser changes are accompanied by pseudoparser changes or vice versa.
+
+[design proposal]: https://docs.google.com/document/d/1eGkTOsFja63wsv8v0vd5JdoTonj-NlN3ujGF0T7xDbM/edit
+[RFC]: https://discourse.llvm.org/t/rfc-a-c-pseudo-parser-for-tooling/59217/49
diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c
--- a/clang/test/Analysis/taint-generic.c
+++ b/clang/test/Analysis/taint-generic.c
@@ -1,20 +1,26 @@
-// RUN: %clang_analyze_cc1 -Wno-format-security -Wno-pointer-to-int-cast -verify %s \
+// RUN: %clang_analyze_cc1 -Wno-format-security -Wno-pointer-to-int-cast \
+// RUN:   -Wno-incompatible-library-redeclaration -verify %s \
 // RUN:   -analyzer-checker=alpha.security.taint \
 // RUN:   -analyzer-checker=core \
 // RUN:   -analyzer-checker=alpha.security.ArrayBoundV2 \
+// RUN:   -analyzer-checker=debug.ExprInspection \
 // RUN:   -analyzer-config \
 // RUN:     alpha.security.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config.yaml
 
-// RUN: %clang_analyze_cc1 -Wno-format-security -Wno-pointer-to-int-cast -verify %s \
+// RUN: %clang_analyze_cc1 -Wno-format-security -Wno-pointer-to-int-cast \
+// RUN:   -Wno-incompatible-library-redeclaration -verify %s \
 // RUN:   -DFILE_IS_STRUCT \
 // RUN:   -analyzer-checker=alpha.security.taint \
 // RUN:   -analyzer-checker=core \
 // RUN:   -analyzer-checker=alpha.security.ArrayBoundV2 \
+// RUN:   -analyzer-checker=debug.ExprInspection \
 // RUN:   -analyzer-config \
 // RUN:     alpha.security.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config.yaml
 
-// RUN: not %clang_analyze_cc1 -Wno-pointer-to-int-cast -verify %s \
+// RUN: not %clang_analyze_cc1 -Wno-pointer-to-int-cast \
+// RUN:   -Wno-incompatible-library-redeclaration -verify %s \
 // RUN:   -analyzer-checker=alpha.security.taint \
+// RUN:   -analyzer-checker=debug.ExprInspection \
 // RUN:   -analyzer-config \
 // RUN:     alpha.security.taint.TaintPropagation:Config=justguessit \
 // RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-INVALID-FILE
@@ -24,8 +30,10 @@
 // CHECK-INVALID-FILE-SAME:        that expects a valid filename instead of
 // CHECK-INVALID-FILE-SAME:        'justguessit'
 
-// RUN: not %clang_analyze_cc1 -verify %s \
+// RUN: not %clang_analyze_cc1 -Wno-incompatible-library-redeclaration \
+// RUN:   -verify %s \
 // RUN:   -analyzer-checker=alpha.security.taint \
+// RUN:   -analyzer-checker=debug.ExprInspection \
 // RUN:   -analyzer-config \
 // RUN:     alpha.security.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config-ill-formed.yaml \
 // RUN:   2>&1 | FileCheck -DMSG=%errc_EINVAL %s -check-prefix=CHECK-ILL-FORMED
@@ -34,8 +42,10 @@
 // CHECK-ILL-FORMED-SAME:        'alpha.security.taint.TaintPropagation:Config',
 // CHECK-ILL-FORMED-SAME:        that expects a valid yaml file: [[MSG]]
 
-// RUN: not %clang_analyze_cc1 -verify %s \
+// RUN: not %clang_analyze_cc1 -Wno-incompatible-library-redeclaration \
+// RUN:   -verify %s \
 // RUN:   -analyzer-checker=alpha.security.taint \
+// RUN:   -analyzer-checker=debug.ExprInspection \
 // RUN:   -analyzer-config \
 // RUN:     alpha.security.taint.TaintPropagation:Config=%S/Inputs/taint-generic-config-invalid-arg.yaml \
 // RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-INVALID-ARG
@@ -46,6 +56,9 @@
 // CHECK-INVALID-ARG-SAME:        rules greater or equal to -1
 
 typedef long long rsize_t;
+void clang_analyzer_isTainted_char(char);
+void clang_analyzer_isTainted_charp(char*);
+void clang_analyzer_isTainted_int(int);
 
 int scanf(const char *restrict format, ...);
 char *gets(char *str);
@@ -60,13 +73,18 @@
 #endif
 
 #define bool _Bool
+#define NULL (void*)0
 
 char *getenv(const char *name);
+
+FILE *fopen(const char *name, const char *mode);
+
 int fscanf(FILE *restrict stream, const char *restrict format, ...);
 int sprintf(char *str, const char *format, ...);
 void setproctitle(const char *fmt, ...);
 void setproctitle_init(int argc, char *argv[], char *envp[]);
 typedef __typeof(sizeof(int)) size_t;
+typedef signed long long ssize_t;
 
 // Define string functions. Use builtin for some of them. They all default to
 // the processing in the taint checker.
@@ -87,6 +105,13 @@
 void *calloc(size_t nmemb, size_t size);
 void bcopy(void *s1, void *s2, size_t n);
 
+typedef size_t socklen_t;
+
+struct sockaddr {
+  unsigned short sa_family;
+  char sa_data[14];
+};
+
 #define BUFSIZE 10
 
 int Buffer[BUFSIZE];
@@ -388,7 +413,6 @@
   return system(c); // expected-warning {{Untrusted data is passed to a system call}}
 }
 
-typedef signed long long ssize_t;
 ssize_t readlink(const char *path, char *buf, size_t bufsiz);
 int testReadlink(char *path, char *buf, size_t bufsiz) {
   ssize_t s = readlink(path, buf, bufsiz);
@@ -420,8 +444,6 @@
   return system(name); // expected-warning {{Untrusted data is passed to a system call}}
 }
 
-struct sockaddr;
-typedef size_t socklen_t;
 int getnameinfo(const struct sockaddr *restrict addr, socklen_t addrlen,
                 char *restrict host, socklen_t hostlen,
                 char *restrict serv, socklen_t servlen, int flags);
@@ -463,6 +485,503 @@
   return system(buf); // expected-warning {{Untrusted data is passed to a system call}}
 }
 
+int fscanf_s(FILE *stream, const char *format, ...);
+void testFscanf_s(const char *fname, int *d) {
+  FILE *f = fopen(fname, "r");
+  fscanf_s(f, "%d", d);
+  clang_analyzer_isTainted_int(*d); // expected-warning {{YES}}
+}
+
+int fread(void *buffer, size_t size, size_t count, FILE *stream);
+void testFread(const char *fname, int *buffer, size_t size, size_t count) {
+  FILE *f = fopen(fname, "r");
+  size_t read = fread(buffer, size, count, f);
+
+  clang_analyzer_isTainted_int(*buffer); // expected-warning {{YES}}
+  clang_analyzer_isTainted_int(read); // expected-warning {{YES}}
+}
+
+ssize_t recv(int sockfd, void *buf, size_t len, int flags);
+void testRecv(int *buf, size_t len, int flags) { // recv() on a tainted descriptor
+  int fd;
+  scanf("%d", &fd); // fake a tainted file descriptor
+
+  size_t read = recv(fd, buf, len, flags); // both the filled buffer and the return value are checked below
+  clang_analyzer_isTainted_int(*buf); // expected-warning {{YES}}
+  clang_analyzer_isTainted_int(read); // expected-warning {{YES}}
+}
+
+ssize_t recvfrom(int sockfd, void *restrict buf, size_t len, int flags,
+                 struct sockaddr *restrict src_addr,
+                 socklen_t *restrict addrlen);
+void testRecvfrom(int *restrict buf, size_t len, int flags,
+                 struct sockaddr *restrict src_addr,
+                 socklen_t *restrict addrlen) {
+  int fd;
+  scanf("%d", &fd); // fake a tainted file descriptor
+
+  size_t read = recvfrom(fd, buf, len, flags, src_addr, addrlen); // both the filled buffer and the return value are checked below
+  clang_analyzer_isTainted_int(*buf); // expected-warning {{YES}}
+  clang_analyzer_isTainted_int(read); // expected-warning {{YES}}
+}
+
+char *ttyname(int fd);
+void testTtyname() { // ttyname() on a tainted descriptor
+  int fd;
+  scanf("%d", &fd); // fake a tainted file descriptor
+
+  char *name = ttyname(fd); // the returned pointer is checked below
+  clang_analyzer_isTainted_charp(name); // expected-warning {{YES}}
+}
+
+int ttyname_r(int fd, char *buf, size_t buflen);
+void testTtyname_r(char *buf, size_t buflen) { // ttyname_r() on a tainted descriptor
+  int fd;
+  scanf("%d", &fd); // fake a tainted file descriptor
+
+  int result = ttyname_r(fd, buf, buflen); // both the output buffer and the return value are checked below
+  clang_analyzer_isTainted_char(*buf); // expected-warning {{YES}}
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+char *dirname(char *path);
+void testDirname() {
+  char buf[10];
+  scanf("%9s", buf);
+
+  char *name = dirname(buf);
+  clang_analyzer_isTainted_charp(name); // expected-warning {{YES}}
+}
+
+char *basename(char *path);
+void testBasename() {
+  char buf[10];
+  scanf("%9s", buf);
+
+  char *name = basename(buf);
+  clang_analyzer_isTainted_charp(name); // expected-warning {{YES}}
+}
+
+int fnmatch(const char *pattern, const char *string, int flags);
+void testFnmatch(const char *pattern, int flags) {
+  char string[10];
+  scanf("%9s", string);
+
+  int result = fnmatch(pattern, string, flags);
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+void *memchr(const void *s, int c, size_t n);
+void testMemchr(int c, size_t n) {
+  char buf[10];
+  scanf("%9s", buf);
+
+  char *result = memchr(buf, c, n);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+void *memrchr(const void *s, int c, size_t n);
+void testMemrchr(int c, size_t n) {
+  char buf[10];
+  scanf("%9s", buf);
+
+  char *result = memrchr(buf, c, n);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+void *rawmemchr(const void *s, int c);
+void testRawmemchr(int c) {
+  char buf[10];
+  scanf("%9s", buf);
+
+  char *result = rawmemchr(buf, c);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+typedef char wchar_t;
+int mbtowc(wchar_t *pwc, const char *s, size_t n);
+void testMbtowc(wchar_t *pwc, size_t n) {
+  char buf[10];
+  scanf("%9s", buf);
+
+  int result = mbtowc(pwc, buf, n);
+  clang_analyzer_isTainted_char(*pwc); // expected-warning {{YES}}
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+int wctomb(char *s, wchar_t wc);
+void testWctomb(char *buf) {
+  wchar_t wc;
+  scanf("%c", &wc);
+
+  int result = wctomb(buf, wc);
+  clang_analyzer_isTainted_char(*buf); // expected-warning {{YES}}
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+int wcwidth(wchar_t c);
+void testWcwidth() {
+  wchar_t wc;
+  scanf("%c", &wc);
+
+  int width = wcwidth(wc);
+  clang_analyzer_isTainted_int(width); // expected-warning {{YES}}
+}
+
+int memcmp(const void *s1, const void *s2, size_t n);
+void testMemcmpWithLHSTainted(size_t n, char *rhs) {
+  char lhs[10];
+  scanf("%9s", lhs);
+
+  int cmp_result = memcmp(lhs, rhs, n);
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+void testMemcmpWithRHSTainted(size_t n, char *lhs) {
+  char rhs[10];
+  scanf("%9s", rhs);
+
+  int cmp_result = memcmp(lhs, rhs, n);
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+void *memcpy(void *restrict dest, const void *restrict src, size_t n);
+void testMemcpy(char *dst, size_t n) {
+  char src[10];
+  scanf("%9s", src);
+
+  char *result = memcpy(dst, src, n);
+
+  clang_analyzer_isTainted_char(*dst); // expected-warning {{YES}}
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+void *memmove(void *dest, const void *src, size_t n);
+void testMemmove(char *dst, size_t n) {
+  char src[10];
+  scanf("%9s", src);
+
+  char *result = memmove(dst, src, n);
+
+  clang_analyzer_isTainted_char(*dst); // expected-warning {{YES}}
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+void *memmem(const void *haystack, size_t haystacklen, const void *needle, size_t needlelen);
+void testMemmem(const void *needle, size_t needlelen) {
+  char haystack[10];
+  scanf("%9s", haystack);
+
+  char *result = memmem(haystack, 9, needle, needlelen);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+char *strstr(const char *haystack, const char *needle);
+void testStrstr(const char *needle) {
+  char haystack[10];
+  scanf("%9s", haystack);
+
+  char *result = strstr(haystack, needle);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+char *strcasestr(const char *haystack, const char *needle);
+void testStrcasestr(const char *needle) {
+  char haystack[10];
+  scanf("%9s", haystack);
+
+  char *result = strcasestr(haystack, needle);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+char *strchrnul(const char *s, int c);
+void testStrchrnul() {
+  char s[10];
+  scanf("%9s", s);
+
+  char *result = strchrnul(s, 9);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+char *index(const char *s, int c);
+void testIndex() {
+  char s[10];
+  scanf("%9s", s);
+
+  char *result = index(s, 9);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+char *rindex(const char *s, int c);
+void testRindex() {
+  char s[10];
+  scanf("%9s", s);
+
+  char *result = rindex(s, 9);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+int strcmp(const char *s1, const char *s2);
+void testStrcmpWithLHSTainted(char *rhs) {
+  char lhs[10];
+  scanf("%9s", lhs);
+
+  int cmp_result = strcmp(lhs, rhs);
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+void testStrcmpWithRHSTainted(char *lhs) {
+  char rhs[10];
+  scanf("%9s", rhs);
+
+  int cmp_result = strcmp(lhs, rhs);
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+int strcasecmp(const char *s1, const char *s2);
+void testStrcasecmpWithLHSTainted(char *rhs) {
+  char lhs[10];
+  scanf("%9s", lhs);
+
+  int cmp_result = strcasecmp(lhs, rhs);
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+void testStrcasecmpWithRHSTainted(char *lhs) {
+  char rhs[10];
+  scanf("%9s", rhs);
+
+  int cmp_result = strcasecmp(lhs, rhs);
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+int strncmp(const char *s1, const char *s2, size_t n);
+void testStrncmpWithLHSTainted(char *rhs, size_t n) {
+  char lhs[10];
+  scanf("%9s", lhs);
+
+  int cmp_result = strncmp(lhs, rhs, n);
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+void testStrncmpWithRHSTainted(char *lhs, size_t n) {
+  char rhs[10];
+  scanf("%9s", rhs);
+
+  int cmp_result = strncmp(lhs, rhs, n);
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+void testStrncmpWithNTainted(char *lhs, char *rhs) {
+  int n;
+  scanf("%d", &n);
+
+  int cmp_result = strncmp(lhs, rhs, n);
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+int strncasecmp(const char *s1, const char *s2, size_t n);
+void testStrncasecmpWithLHSTainted(char *rhs, size_t n) { // taint enters through the first string
+  char lhs[10];
+  scanf("%9s", lhs); // taint source: lhs is filled by scanf
+
+  int cmp_result = strncasecmp(lhs, rhs, n); // call strncasecmp itself; the strncmp tests already cover strncmp
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+void testStrncasecmpWithRHSTainted(char *lhs, size_t n) { // taint enters through the second string
+  char rhs[10];
+  scanf("%9s", rhs); // taint source: rhs is filled by scanf
+
+  int cmp_result = strncasecmp(lhs, rhs, n); // call strncasecmp itself; the strncmp tests already cover strncmp
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+void testStrncasecmpWithNTainted(char *lhs, char *rhs) { // taint enters through the length argument
+  int n;
+  scanf("%d", &n); // taint source: n is filled by scanf
+
+  int cmp_result = strncasecmp(lhs, rhs, n); // call strncasecmp itself; the strncmp tests already cover strncmp
+  clang_analyzer_isTainted_int(cmp_result); // expected-warning {{YES}}
+}
+
+size_t strspn(const char *s, const char *accept);
+void testStrspnFirstArgTainted(const char *accept) {
+  char s[10];
+  scanf("%9s", s);
+
+  size_t result = strspn(s, accept);
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+void testStrspnSecondArgTainted(const char *s) {
+  char accept[10];
+  scanf("%9s", accept);
+
+  size_t result = strspn(s, accept);
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+size_t strcspn(const char *s, const char *reject);
+void testStrcspnFirstArgTainted(const char *reject) {
+  char s[10];
+  scanf("%9s", s);
+
+  size_t result = strcspn(s, reject);
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+void testStrcspnSecondArgTainted(const char *s) {
+  char reject[10];
+  scanf("%9s", reject);
+
+  size_t result = strcspn(s, reject);
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+char *strpbrk(const char *s, const char *accept);
+void testStrpbrk(const char *accept) {
+  char s[10];
+  scanf("%9s", s);
+
+  char *result = strpbrk(s, accept);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+char *strndup(const char *s, size_t n);
+void testStrndup(size_t n) {
+  char s[10];
+  scanf("%9s", s);
+
+  char *result = strndup(s, n);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+char *strdupa(const char *s);
+void testStrdupa() {
+  char s[10];
+  scanf("%9s", s);
+
+  char *result = strdupa(s);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+char *strndupa(const char *s, size_t n);
+void testStrndupa(size_t n) {
+  char s[10];
+  scanf("%9s", s);
+
+  char *result = strndupa(s, n);
+  clang_analyzer_isTainted_charp(result); // expected-warning {{YES}}
+}
+
+size_t strlen(const char *s);
+void testStrlen() {
+  char s[10];
+  scanf("%9s", s);
+
+  size_t result = strlen(s);
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+size_t strnlen(const char *s, size_t maxlen);
+void testStrnlen(size_t maxlen) {
+  char s[10];
+  scanf("%9s", s);
+
+  size_t result = strnlen(s, maxlen);
+  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+}
+
+long strtol(const char *restrict nptr, char **restrict endptr, int base);
+long long strtoll(const char *restrict nptr, char **restrict endptr, int base);
+unsigned long int strtoul(const char *nptr, char **endptr, int base);
+unsigned long long int strtoull(const char *nptr, char **endptr, int base);
+void testStrtolVariants(char **restrict endptr, int base) {
+  char s[10];
+  scanf("%9s", s);
+
+  long result_l = strtol(s, endptr, base);
+  clang_analyzer_isTainted_int(result_l); // expected-warning {{YES}}
+
+  long long result_ll = strtoll(s, endptr, base);
+  clang_analyzer_isTainted_int(result_ll); // expected-warning {{YES}}
+
+  unsigned long result_ul = strtoul(s, endptr, base);
+  clang_analyzer_isTainted_int(result_ul); // expected-warning {{YES}}
+
+  unsigned long long result_ull = strtoull(s, endptr, base);
+  clang_analyzer_isTainted_int(result_ull); // expected-warning {{YES}}
+}
+
+int isalnum(int c);
+int isalpha(int c);
+int isascii(int c);
+int isblank(int c);
+int iscntrl(int c);
+int isdigit(int c);
+int isgraph(int c);
+int islower(int c);
+int isprint(int c);
+int ispunct(int c);
+int isspace(int c);
+int isupper(int c);
+int isxdigit(int c);
+
+void testIsFunctions() {
+  char c;
+  scanf("%c", &c);
+
+  int alnum = isalnum(c);
+  clang_analyzer_isTainted_int(alnum); // expected-warning {{YES}}
+
+  int alpha = isalpha(c);
+  clang_analyzer_isTainted_int(alpha); // expected-warning {{YES}}
+
+  int ascii = isascii(c);
+  clang_analyzer_isTainted_int(ascii); // expected-warning {{YES}}
+
+  int blank = isblank(c);
+  clang_analyzer_isTainted_int(blank); // expected-warning {{YES}}
+
+  int cntrl = iscntrl(c);
+  clang_analyzer_isTainted_int(cntrl); // expected-warning {{YES}}
+
+  int digit = isdigit(c);
+  clang_analyzer_isTainted_int(digit); // expected-warning {{YES}}
+
+  int graph = isgraph(c);
+  clang_analyzer_isTainted_int(graph); // expected-warning {{YES}}
+
+  int lower = islower(c);
+  clang_analyzer_isTainted_int(lower); // expected-warning {{YES}}
+
+  int print = isprint(c);
+  clang_analyzer_isTainted_int(print); // expected-warning {{YES}}
+
+  int punct = ispunct(c);
+  clang_analyzer_isTainted_int(punct); // expected-warning {{YES}}
+
+  int space = isspace(c);
+  clang_analyzer_isTainted_int(space); // expected-warning {{YES}}
+
+  int upper = isupper(c);
+  clang_analyzer_isTainted_int(upper); // expected-warning {{YES}}
+
+  int xdigit = isxdigit(c);
+  clang_analyzer_isTainted_int(xdigit); // expected-warning {{YES}}
+}
+
+void qsort(void *base, size_t nmemb, size_t size, int (*compar)(const void *, const void *));
+void qsort_r(void *base, size_t nmemb, size_t size, int (*compar)(const void *, const void *, void *), void *arg);
+void testQsort() { // data stays tainted after being passed through qsort and qsort_r
+  int data[1];
+  scanf("%d", data); // taint source: data[0] is filled by scanf
+
+  qsort(data, sizeof(data) / sizeof(data[0]), sizeof(data[0]), NULL); // nmemb is an element count, not a byte count
+  clang_analyzer_isTainted_int(data[0]); // expected-warning {{YES}}
+  qsort_r(data, sizeof(data) / sizeof(data[0]), sizeof(data[0]), NULL, NULL); // same element-count fix
+  clang_analyzer_isTainted_int(data[0]); // expected-warning {{YES}}
+}
+
 // Test configuration
 int mySource1(void);
 void mySource2(int*);
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-fma.c b/clang/test/CodeGen/PowerPC/builtins-ppc-fma.c
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-fma.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-fma.c
@@ -32,12 +32,8 @@
   // CHECK: <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[RESULT]])
 
   vf = __builtin_vsx_xvnmsubasp(vf, vf, vf);
-  // CHECK: [[RESULT:%[^ ]+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[RESULT2:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[RESULT]])
-  // CHECK: fneg <4 x float> [[RESULT2]]
+  // CHECK: call <4 x float> @llvm.ppc.fnmsub.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
 
   vd = __builtin_vsx_xvnmsubadp(vd, vd, vd);
-  // CHECK: [[RESULT:%[^ ]+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[RESULT2:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[RESULT]])
-  // CHECK: fneg <2 x double> [[RESULT2]]
+  // CHECK: call <2 x double> @llvm.ppc.fnmsub.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
 }
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c
@@ -142,9 +142,7 @@
 
   vf = __builtin_vsx_xvnmsubasp(vf, vf, vf);
   // CHECK-LABEL: try-xvnmsubasp
-  // CHECK-UNCONSTRAINED: [[RESULT0:%[^ ]+]] = fneg <4 x float> %{{.*}}
-  // CHECK-UNCONSTRAINED: [[RESULT1:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[RESULT0]])
-  // CHECK-UNCONSTRAINED: fneg <4 x float> [[RESULT1]]
+  // CHECK-UNCONSTRAINED: call <4 x float> @llvm.ppc.fnmsub.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   // CHECK-CONSTRAINED: [[RESULT0:%[^ ]+]] = fneg <4 x float> %{{.*}}
   // CHECK-CONSTRAINED: [[RESULT1:%[^ ]+]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[RESULT0]], metadata !"round.tonearest", metadata !"fpexcept.strict")
   // CHECK-CONSTRAINED: fneg <4 x float> [[RESULT1]]
@@ -152,9 +150,7 @@
 
   vd = __builtin_vsx_xvnmsubadp(vd, vd, vd);
   // CHECK-LABEL: try-xvnmsubadp
-  // CHECK-UNCONSTRAINED: [[RESULT0:%[^ ]+]] = fneg <2 x double> %{{.*}}
-  // CHECK-UNCONSTRAINED: [[RESULT1:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[RESULT0]])
-  // CHECK-UNCONSTRAINED: fneg <2 x double> [[RESULT1]]
+  // CHECK-UNCONSTRAINED: call <2 x double> @llvm.ppc.fnmsub.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   // CHECK-CONSTRAINED: [[RESULT0:%[^ ]+]] = fneg <2 x double> %{{.*}}
   // CHECK-CONSTRAINED: [[RESULT1:%[^ ]+]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[RESULT0]], metadata !"round.tonearest", metadata !"fpexcept.strict")
   // CHECK-CONSTRAINED: fneg <2 x double> [[RESULT1]]
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-vsx.c b/clang/test/CodeGen/PowerPC/builtins-ppc-vsx.c
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-vsx.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-vsx.c
@@ -894,20 +894,12 @@
 // CHECK-LE-NEXT: fneg <2 x double> %[[FM]]
 
   res_vf = vec_nmsub(vf, vf, vf);
-// CHECK: fneg <4 x float> %{{[0-9]+}}
-// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
-// CHECK: fneg <4 x float> %{{[0-9]+}}
-// CHECK-LE: fneg <4 x float> %{{[0-9]+}}
-// CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
-// CHECK-LE: fneg <4 x float> %{{[0-9]+}}
+// CHECK: call <4 x float> @llvm.ppc.fnmsub.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
+// CHECK-LE: call <4 x float> @llvm.ppc.fnmsub.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
 
   res_vd = vec_nmsub(vd, vd, vd);
-// CHECK: fneg <2 x double> %{{[0-9]+}}
-// CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
-// CHECK-NEXT: fneg <2 x double> %[[FM]]
-// CHECK-LE: fneg <2 x double> %{{[0-9]+}}
-// CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
-// CHECK-LE-NEXT: fneg <2 x double> %[[FM]]
+// CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.ppc.fnmsub.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
+// CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.ppc.fnmsub.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
 
   /* vec_nor */
   res_vsll = vec_nor(vsll, vsll);
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.c
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.c
@@ -95,10 +95,11 @@
 // CHECK-LABEL: @fnmsub(
 // CHECK:         [[D_ADDR:%.*]] = alloca double, align 8
 // CHECK-NEXT:    store double [[D:%.*]], double* [[D_ADDR]], align 8
+// CHECK-COUNT-3:    load double, double* [[D_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[D_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[D_ADDR]], align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[D_ADDR]], align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.ppc.fnmsub(double [[TMP0]], double [[TMP1]], double [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.ppc.fnmsub.f64(double [[TMP0]], double [[TMP1]], double [[TMP2]])
 // CHECK-NEXT:    ret double [[TMP3]]
 //
 double fnmsub (double d) {
@@ -108,10 +109,11 @@
 // CHECK-LABEL: @fnmsubs(
 // CHECK:         [[F_ADDR:%.*]] = alloca float, align 4
 // CHECK-NEXT:    store float [[F:%.*]], float* [[F_ADDR]], align 4
+// CHECK-COUNT-3:    load float, float* [[F_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[F_ADDR]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[F_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[F_ADDR]], align 4
-// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.ppc.fnmsubs(float [[TMP0]], float [[TMP1]], float [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.ppc.fnmsub.f32(float [[TMP0]], float [[TMP1]], float [[TMP2]])
 // CHECK-NEXT:    ret float [[TMP3]]
 //
 float fnmsubs (float f) {
diff --git a/clang/test/Modules/Inputs/AddRemoveIrrelevantModuleMap/a.modulemap b/clang/test/Modules/Inputs/AddRemoveIrrelevantModuleMap/a.modulemap
deleted file mode 100644
--- a/clang/test/Modules/Inputs/AddRemoveIrrelevantModuleMap/a.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module a { }
diff --git a/clang/test/Modules/Inputs/AddRemoveIrrelevantModuleMap/b.modulemap b/clang/test/Modules/Inputs/AddRemoveIrrelevantModuleMap/b.modulemap
deleted file mode 100644
--- a/clang/test/Modules/Inputs/AddRemoveIrrelevantModuleMap/b.modulemap
+++ /dev/null
@@ -1 +0,0 @@
-module b { }
diff --git a/clang/test/Modules/add-remove-irrelevant-module-map.m b/clang/test/Modules/add-remove-irrelevant-module-map.m
--- a/clang/test/Modules/add-remove-irrelevant-module-map.m
+++ b/clang/test/Modules/add-remove-irrelevant-module-map.m
@@ -1,16 +1,58 @@
-// RUN: rm -rf %t
-// RUN: rm -rf %t.mcp
-// RUN: mkdir -p %t
+// RUN: rm -rf %t && mkdir %t
+// RUN: split-file %s %t
 
-// Build without b.modulemap
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t.mcp -fdisable-module-hash -fmodule-map-file=%S/Inputs/AddRemoveIrrelevantModuleMap/a.modulemap %s -verify
-// RUN: cp %t.mcp/a.pcm %t/a.pcm
+//--- a.modulemap
+module a {}
 
-// Build with b.modulemap
-// RUN: rm -rf %t.mcp
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t.mcp -fdisable-module-hash -fmodule-map-file=%S/Inputs/AddRemoveIrrelevantModuleMap/a.modulemap -fmodule-map-file=%S/Inputs/AddRemoveIrrelevantModuleMap/b.modulemap %s -verify
-// RUN: not diff %t.mcp/a.pcm %t/a.pcm
+//--- b.modulemap
+module b {}
 
+//--- test-simple.m
 // expected-no-diagnostics
-
 @import a;
+
+// Build without b.modulemap:
+//
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/cache -fdisable-module-hash \
+// RUN:   -fmodule-map-file=%t/a.modulemap %t/test-simple.m -verify
+// RUN: mv %t/cache %t/cache-without-b
+
+// Build with b.modulemap:
+//
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/cache -fdisable-module-hash \
+// RUN:   -fmodule-map-file=%t/a.modulemap -fmodule-map-file=%t/b.modulemap %t/test-simple.m -verify
+// RUN: mv %t/cache %t/cache-with-b
+
+// Neither PCM file considers 'b.modulemap' an input:
+//
+// RUN: %clang_cc1 -module-file-info %t/cache-without-b/a.pcm | FileCheck %s --check-prefix=CHECK-B
+// RUN: %clang_cc1 -module-file-info %t/cache-with-b/a.pcm | FileCheck %s --check-prefix=CHECK-B
+// CHECK-B-NOT: Input file: {{.*}}b.modulemap
+
+//--- c.modulemap
+module c [no_undeclared_includes] { header "c.h" }
+
+//--- c.h
+#if __has_include("d.h") // This should use 'd.modulemap' in order to determine that 'd.h'
+                         // doesn't exist for 'c' because of its '[no_undeclared_includes]'.
+#endif
+
+//--- d.modulemap
+module d { header "d.h" }
+
+//--- d.h
+// empty
+
+//--- test-no-undeclared-includes.m
+// expected-no-diagnostics
+@import c;
+
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t/cache -fdisable-module-hash \
+// RUN:   -fmodule-map-file=%t/c.modulemap -fmodule-map-file=%t/d.modulemap \
+// RUN:   %t/test-no-undeclared-includes.m -verify
+
+// The PCM file considers 'd.modulemap' an input because it affects the compilation,
+// although it doesn't describe the built module or its imports.
+//
+// RUN: %clang_cc1 -module-file-info %t/cache/c.pcm | FileCheck %s --check-prefix=CHECK-D
+// CHECK-D: Input file: {{.*}}d.modulemap
diff --git a/clang/test/OpenMP/irbuilder_nested_openmp_parallel_empty.c b/clang/test/OpenMP/irbuilder_nested_openmp_parallel_empty.c
--- a/clang/test/OpenMP/irbuilder_nested_openmp_parallel_empty.c
+++ b/clang/test/OpenMP/irbuilder_nested_openmp_parallel_empty.c
@@ -33,8 +33,7 @@
 
 // ALL-LABEL: @_Z17nested_parallel_1Pfid(
 // ALL-NEXT:  entry:
-// ALL-NEXT:    [[STRUCTARG14:%.*]] = alloca { { i32*, double*, float** }*, i32*, double*, float** }, align 8
-// ALL-NEXT:    [[STRUCTARG:%.*]] = alloca { i32*, double*, float** }, align 8
+// ALL-NEXT:    [[STRUCTARG14:%.*]] = alloca { i32*, double*, float** }, align 8
 // ALL-NEXT:    [[R_ADDR:%.*]] = alloca float*, align 8
 // ALL-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // ALL-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
@@ -44,15 +43,13 @@
 // ALL-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // ALL-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // ALL:       omp_parallel:
-// ALL-NEXT:    [[GEP_STRUCTARG:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG14]], i32 0, i32 0
-// ALL-NEXT:    store { i32*, double*, float** }* [[STRUCTARG]], { i32*, double*, float** }** [[GEP_STRUCTARG]], align 8
-// ALL-NEXT:    [[GEP_A_ADDR15:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG14]], i32 0, i32 1
+// ALL-NEXT:    [[GEP_A_ADDR15:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG14]], i32 0, i32 0
 // ALL-NEXT:    store i32* [[A_ADDR]], i32** [[GEP_A_ADDR15]], align 8
-// ALL-NEXT:    [[GEP_B_ADDR16:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG14]], i32 0, i32 2
+// ALL-NEXT:    [[GEP_B_ADDR16:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG14]], i32 0, i32 1
 // ALL-NEXT:    store double* [[B_ADDR]], double** [[GEP_B_ADDR16]], align 8
-// ALL-NEXT:    [[GEP_R_ADDR17:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG14]], i32 0, i32 3
+// ALL-NEXT:    [[GEP_R_ADDR17:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG14]], i32 0, i32 2
 // ALL-NEXT:    store float** [[R_ADDR]], float*** [[GEP_R_ADDR17]], align 8
-// ALL-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { i32*, double*, float** }*, i32*, double*, float** }*)* @_Z17nested_parallel_1Pfid..omp_par.2 to void (i32*, i32*, ...)*), { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG14]])
+// ALL-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i32*, double*, float** }*)* @_Z17nested_parallel_1Pfid..omp_par.2 to void (i32*, i32*, ...)*), { i32*, double*, float** }* [[STRUCTARG14]])
 // ALL-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT13:%.*]]
 // ALL:       omp.par.outlined.exit13:
 // ALL-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
@@ -71,9 +68,6 @@
 
 // ALL-LABEL: @_Z17nested_parallel_2Pfid(
 // ALL-NEXT:  entry:
-// ALL-NEXT:    [[STRUCTARG68:%.*]] = alloca { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }, align 8
-// ALL-NEXT:    [[STRUCTARG64:%.*]] = alloca { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }, align 8
-// ALL-NEXT:    [[STRUCTARG59:%.*]] = alloca { i32*, double*, float** }, align 8
 // ALL-NEXT:    [[STRUCTARG:%.*]] = alloca { i32*, double*, float** }, align 8
 // ALL-NEXT:    [[R_ADDR:%.*]] = alloca float*, align 8
 // ALL-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
@@ -84,19 +78,13 @@
 // ALL-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // ALL-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // ALL:       omp_parallel:
-// ALL-NEXT:    [[GEP_A_ADDR:%.*]] = getelementptr { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }, { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }* [[STRUCTARG68]], i32 0, i32 0
+// ALL-NEXT:    [[GEP_A_ADDR:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG]], i32 0, i32 0
 // ALL-NEXT:    store i32* [[A_ADDR]], i32** [[GEP_A_ADDR]], align 8
-// ALL-NEXT:    [[GEP_B_ADDR:%.*]] = getelementptr { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }, { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }* [[STRUCTARG68]], i32 0, i32 1
+// ALL-NEXT:    [[GEP_B_ADDR:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG]], i32 0, i32 1
 // ALL-NEXT:    store double* [[B_ADDR]], double** [[GEP_B_ADDR]], align 8
-// ALL-NEXT:    [[GEP_R_ADDR:%.*]] = getelementptr { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }, { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }* [[STRUCTARG68]], i32 0, i32 2
+// ALL-NEXT:    [[GEP_R_ADDR:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG]], i32 0, i32 2
 // ALL-NEXT:    store float** [[R_ADDR]], float*** [[GEP_R_ADDR]], align 8
-// ALL-NEXT:    [[GEP_STRUCTARG64:%.*]] = getelementptr { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }, { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }* [[STRUCTARG68]], i32 0, i32 3
-// ALL-NEXT:    store { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }* [[STRUCTARG64]], { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }** [[GEP_STRUCTARG64]], align 8
-// ALL-NEXT:    [[GEP_STRUCTARG69:%.*]] = getelementptr { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }, { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }* [[STRUCTARG68]], i32 0, i32 4
-// ALL-NEXT:    store { i32*, double*, float** }* [[STRUCTARG]], { i32*, double*, float** }** [[GEP_STRUCTARG69]], align 8
-// ALL-NEXT:    [[GEP_STRUCTARG5970:%.*]] = getelementptr { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }, { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }* [[STRUCTARG68]], i32 0, i32 5
-// ALL-NEXT:    store { i32*, double*, float** }* [[STRUCTARG59]], { i32*, double*, float** }** [[GEP_STRUCTARG5970]], align 8
-// ALL-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }*)* @_Z17nested_parallel_2Pfid..omp_par.5 to void (i32*, i32*, ...)*), { i32*, double*, float**, { i32*, double*, float**, { i32*, double*, float** }*, { i32*, double*, float** }* }*, { i32*, double*, float** }*, { i32*, double*, float** }* }* [[STRUCTARG68]])
+// ALL-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i32*, double*, float** }*)* @_Z17nested_parallel_2Pfid..omp_par.5 to void (i32*, i32*, ...)*), { i32*, double*, float** }* [[STRUCTARG]])
 // ALL-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT55:%.*]]
 // ALL:       omp.par.outlined.exit55:
 // ALL-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
diff --git a/clang/test/OpenMP/irbuilder_nested_parallel_for.c b/clang/test/OpenMP/irbuilder_nested_parallel_for.c
--- a/clang/test/OpenMP/irbuilder_nested_parallel_for.c
+++ b/clang/test/OpenMP/irbuilder_nested_parallel_for.c
@@ -44,8 +44,7 @@
 
 // CHECK-LABEL: @_Z14parallel_for_1Pfid(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[STRUCTARG17:%.*]] = alloca { { i32*, double*, float** }*, i32*, double*, float** }, align 8
-// CHECK-NEXT:    [[STRUCTARG:%.*]] = alloca { i32*, double*, float** }, align 8
+// CHECK-NEXT:    [[STRUCTARG17:%.*]] = alloca { i32*, double*, float** }, align 8
 // CHECK-NEXT:    [[R_ADDR:%.*]] = alloca float*, align 8
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
@@ -55,15 +54,13 @@
 // CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK:       omp_parallel:
-// CHECK-NEXT:    [[GEP_STRUCTARG:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 0
-// CHECK-NEXT:    store { i32*, double*, float** }* [[STRUCTARG]], { i32*, double*, float** }** [[GEP_STRUCTARG]], align 8
-// CHECK-NEXT:    [[GEP_A_ADDR18:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 1
+// CHECK-NEXT:    [[GEP_A_ADDR18:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 0
 // CHECK-NEXT:    store i32* [[A_ADDR]], i32** [[GEP_A_ADDR18]], align 8
-// CHECK-NEXT:    [[GEP_B_ADDR19:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 2
+// CHECK-NEXT:    [[GEP_B_ADDR19:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 1
 // CHECK-NEXT:    store double* [[B_ADDR]], double** [[GEP_B_ADDR19]], align 8
-// CHECK-NEXT:    [[GEP_R_ADDR20:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 3
+// CHECK-NEXT:    [[GEP_R_ADDR20:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 2
 // CHECK-NEXT:    store float** [[R_ADDR]], float*** [[GEP_R_ADDR20]], align 8
-// CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { i32*, double*, float** }*, i32*, double*, float** }*)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]])
+// CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i32*, double*, float** }*)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), { i32*, double*, float** }* [[STRUCTARG17]])
 // CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT16:%.*]]
 // CHECK:       omp.par.outlined.exit16:
 // CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
@@ -72,34 +69,31 @@
 //
 // CHECK-DEBUG-LABEL: @_Z14parallel_for_1Pfid(
 // CHECK-DEBUG-NEXT:  entry:
-// CHECK-DEBUG-NEXT:    [[STRUCTARG17:%.*]] = alloca { { i32*, double*, float** }*, i32*, double*, float** }, align 8
-// CHECK-DEBUG-NEXT:    [[STRUCTARG:%.*]] = alloca { i32*, double*, float** }, align 8
+// CHECK-DEBUG-NEXT:    [[STRUCTARG17:%.*]] = alloca { i32*, double*, float** }, align 8
 // CHECK-DEBUG-NEXT:    [[R_ADDR:%.*]] = alloca float*, align 8
 // CHECK-DEBUG-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
 // CHECK-DEBUG-NEXT:    store float* [[R:%.*]], float** [[R_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73:![0-9]+]]
 // CHECK-DEBUG-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META74:![0-9]+]], metadata !DIExpression()), !dbg [[DBG75:![0-9]+]]
 // CHECK-DEBUG-NEXT:    store double [[B:%.*]], double* [[B_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata [[META77:![0-9]+]], metadata !DIExpression()), !dbg [[DBG78:![0-9]+]]
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]]), !dbg [[DBG79:![0-9]+]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77:![0-9]+]]
+// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]]), !dbg [[DBG78:![0-9]+]]
 // CHECK-DEBUG-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK-DEBUG:       omp_parallel:
-// CHECK-DEBUG-NEXT:    [[GEP_STRUCTARG:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 0
-// CHECK-DEBUG-NEXT:    store { i32*, double*, float** }* [[STRUCTARG]], { i32*, double*, float** }** [[GEP_STRUCTARG]], align 8
-// CHECK-DEBUG-NEXT:    [[GEP_A_ADDR18:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 1
+// CHECK-DEBUG-NEXT:    [[GEP_A_ADDR18:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 0
 // CHECK-DEBUG-NEXT:    store i32* [[A_ADDR]], i32** [[GEP_A_ADDR18]], align 8
-// CHECK-DEBUG-NEXT:    [[GEP_B_ADDR19:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 2
+// CHECK-DEBUG-NEXT:    [[GEP_B_ADDR19:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 1
 // CHECK-DEBUG-NEXT:    store double* [[B_ADDR]], double** [[GEP_B_ADDR19]], align 8
-// CHECK-DEBUG-NEXT:    [[GEP_R_ADDR20:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 3
+// CHECK-DEBUG-NEXT:    [[GEP_R_ADDR20:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 2
 // CHECK-DEBUG-NEXT:    store float** [[R_ADDR]], float*** [[GEP_R_ADDR20]], align 8
-// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB6]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { i32*, double*, float** }*, i32*, double*, float** }*)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]]), !dbg [[DBG80:![0-9]+]]
+// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB6]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i32*, double*, float** }*)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), { i32*, double*, float** }* [[STRUCTARG17]]), !dbg [[DBG79:![0-9]+]]
 // CHECK-DEBUG-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT16:%.*]]
 // CHECK-DEBUG:       omp.par.outlined.exit16:
 // CHECK-DEBUG-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 // CHECK-DEBUG:       omp.par.exit.split:
-// CHECK-DEBUG-NEXT:    ret void, !dbg [[DBG82:![0-9]+]]
+// CHECK-DEBUG-NEXT:    ret void, !dbg [[DBG81:![0-9]+]]
 //
 void parallel_for_1(float *r, int a, double b) {
 #pragma omp parallel
@@ -116,9 +110,6 @@
 
 // CHECK-LABEL: @_Z14parallel_for_2Pfid(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[STRUCTARG218:%.*]] = alloca { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, align 8
-// CHECK-NEXT:    [[STRUCTARG214:%.*]] = alloca { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, align 8
-// CHECK-NEXT:    [[STRUCTARG209:%.*]] = alloca { i32*, double*, float** }, align 8
 // CHECK-NEXT:    [[STRUCTARG:%.*]] = alloca { i32*, double*, float** }, align 8
 // CHECK-NEXT:    [[R_ADDR:%.*]] = alloca float*, align 8
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
@@ -137,19 +128,13 @@
 // CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK:       omp_parallel:
-// CHECK-NEXT:    [[GEP_STRUCTARG214:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 0
-// CHECK-NEXT:    store { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG214]], { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }** [[GEP_STRUCTARG214]], align 8
-// CHECK-NEXT:    [[GEP_STRUCTARG219:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 1
-// CHECK-NEXT:    store { i32*, double*, float** }* [[STRUCTARG]], { i32*, double*, float** }** [[GEP_STRUCTARG219]], align 8
-// CHECK-NEXT:    [[GEP_A_ADDR:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 2
+// CHECK-NEXT:    [[GEP_A_ADDR:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG]], i32 0, i32 0
 // CHECK-NEXT:    store i32* [[A_ADDR]], i32** [[GEP_A_ADDR]], align 8
-// CHECK-NEXT:    [[GEP_B_ADDR:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 3
+// CHECK-NEXT:    [[GEP_B_ADDR:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG]], i32 0, i32 1
 // CHECK-NEXT:    store double* [[B_ADDR]], double** [[GEP_B_ADDR]], align 8
-// CHECK-NEXT:    [[GEP_R_ADDR:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 4
+// CHECK-NEXT:    [[GEP_R_ADDR:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG]], i32 0, i32 2
 // CHECK-NEXT:    store float** [[R_ADDR]], float*** [[GEP_R_ADDR]], align 8
-// CHECK-NEXT:    [[GEP_STRUCTARG209220:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 5
-// CHECK-NEXT:    store { i32*, double*, float** }* [[STRUCTARG209]], { i32*, double*, float** }** [[GEP_STRUCTARG209220]], align 8
-// CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]])
+// CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i32*, double*, float** }*)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), { i32*, double*, float** }* [[STRUCTARG]])
 // CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT184:%.*]]
 // CHECK:       omp.par.outlined.exit184:
 // CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
@@ -205,9 +190,6 @@
 //
 // CHECK-DEBUG-LABEL: @_Z14parallel_for_2Pfid(
 // CHECK-DEBUG-NEXT:  entry:
-// CHECK-DEBUG-NEXT:    [[STRUCTARG218:%.*]] = alloca { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, align 8
-// CHECK-DEBUG-NEXT:    [[STRUCTARG214:%.*]] = alloca { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, align 8
-// CHECK-DEBUG-NEXT:    [[STRUCTARG209:%.*]] = alloca { i32*, double*, float** }, align 8
 // CHECK-DEBUG-NEXT:    [[STRUCTARG:%.*]] = alloca { i32*, double*, float** }, align 8
 // CHECK-DEBUG-NEXT:    [[R_ADDR:%.*]] = alloca float*, align 8
 // CHECK-DEBUG-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
@@ -221,80 +203,74 @@
 // CHECK-DEBUG-NEXT:    [[P_UPPERBOUND205:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT:    [[P_STRIDE206:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT:    store float* [[R:%.*]], float** [[R_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG135:![0-9]+]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG134:![0-9]+]]
 // CHECK-DEBUG-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META136:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137:![0-9]+]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG136:![0-9]+]]
 // CHECK-DEBUG-NEXT:    store double [[B:%.*]], double* [[B_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139:![0-9]+]]
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB13:[0-9]+]]), !dbg [[DBG140:![0-9]+]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata [[META137:![0-9]+]], metadata !DIExpression()), !dbg [[DBG138:![0-9]+]]
+// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB13:[0-9]+]]), !dbg [[DBG139:![0-9]+]]
 // CHECK-DEBUG-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK-DEBUG:       omp_parallel:
-// CHECK-DEBUG-NEXT:    [[GEP_STRUCTARG214:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 0
-// CHECK-DEBUG-NEXT:    store { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG214]], { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }** [[GEP_STRUCTARG214]], align 8
-// CHECK-DEBUG-NEXT:    [[GEP_STRUCTARG219:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 1
-// CHECK-DEBUG-NEXT:    store { i32*, double*, float** }* [[STRUCTARG]], { i32*, double*, float** }** [[GEP_STRUCTARG219]], align 8
-// CHECK-DEBUG-NEXT:    [[GEP_A_ADDR:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 2
+// CHECK-DEBUG-NEXT:    [[GEP_A_ADDR:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG]], i32 0, i32 0
 // CHECK-DEBUG-NEXT:    store i32* [[A_ADDR]], i32** [[GEP_A_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    [[GEP_B_ADDR:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 3
+// CHECK-DEBUG-NEXT:    [[GEP_B_ADDR:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG]], i32 0, i32 1
 // CHECK-DEBUG-NEXT:    store double* [[B_ADDR]], double** [[GEP_B_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    [[GEP_R_ADDR:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 4
+// CHECK-DEBUG-NEXT:    [[GEP_R_ADDR:%.*]] = getelementptr { i32*, double*, float** }, { i32*, double*, float** }* [[STRUCTARG]], i32 0, i32 2
 // CHECK-DEBUG-NEXT:    store float** [[R_ADDR]], float*** [[GEP_R_ADDR]], align 8
-// CHECK-DEBUG-NEXT:    [[GEP_STRUCTARG209220:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 5
-// CHECK-DEBUG-NEXT:    store { i32*, double*, float** }* [[STRUCTARG209]], { i32*, double*, float** }** [[GEP_STRUCTARG209220]], align 8
-// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB13]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]]), !dbg [[DBG141:![0-9]+]]
+// CHECK-DEBUG-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB13]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i32*, double*, float** }*)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), { i32*, double*, float** }* [[STRUCTARG]]), !dbg [[DBG140:![0-9]+]]
 // CHECK-DEBUG-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT184:%.*]]
 // CHECK-DEBUG:       omp.par.outlined.exit184:
 // CHECK-DEBUG-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 // CHECK-DEBUG:       omp.par.exit.split:
-// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[I185]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG148:![0-9]+]]
-// CHECK-DEBUG-NEXT:    store i32 0, i32* [[I185]], align 4, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON_17]], %struct.anon.17* [[AGG_CAPTURED186]], i32 0, i32 0, !dbg [[DBG149:![0-9]+]]
-// CHECK-DEBUG-NEXT:    store i32* [[I185]], i32** [[TMP0]], align 8, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_18]], %struct.anon.18* [[AGG_CAPTURED187]], i32 0, i32 0, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[TMP2:%.*]] = load i32, i32* [[I185]], align 4, !dbg [[DBG150:![0-9]+]]
-// CHECK-DEBUG-NEXT:    store i32 [[TMP2]], i32* [[TMP1]], align 4, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    call void @__captured_stmt.19(i32* [[DOTCOUNT_ADDR188]], %struct.anon.17* [[AGG_CAPTURED186]]), !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[DOTCOUNT189:%.*]] = load i32, i32* [[DOTCOUNT_ADDR188]], align 4, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_PREHEADER190:%.*]], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT:    call void @llvm.dbg.declare(metadata i32* [[I185]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147:![0-9]+]]
+// CHECK-DEBUG-NEXT:    store i32 0, i32* [[I185]], align 4, !dbg [[DBG147]]
+// CHECK-DEBUG-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON_17]], %struct.anon.17* [[AGG_CAPTURED186]], i32 0, i32 0, !dbg [[DBG148:![0-9]+]]
+// CHECK-DEBUG-NEXT:    store i32* [[I185]], i32** [[TMP0]], align 8, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_18]], %struct.anon.18* [[AGG_CAPTURED187]], i32 0, i32 0, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[TMP2:%.*]] = load i32, i32* [[I185]], align 4, !dbg [[DBG149:![0-9]+]]
+// CHECK-DEBUG-NEXT:    store i32 [[TMP2]], i32* [[TMP1]], align 4, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    call void @__captured_stmt.19(i32* [[DOTCOUNT_ADDR188]], %struct.anon.17* [[AGG_CAPTURED186]]), !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[DOTCOUNT189:%.*]] = load i32, i32* [[DOTCOUNT_ADDR188]], align 4, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_PREHEADER190:%.*]], !dbg [[DBG148]]
 // CHECK-DEBUG:       omp_loop.preheader190:
-// CHECK-DEBUG-NEXT:    store i32 0, i32* [[P_LOWERBOUND204]], align 4, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    store i32 [[TMP3]], i32* [[P_UPPERBOUND205]], align 4, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    store i32 1, i32* [[P_STRIDE206]], align 4, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM207:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB42:[0-9]+]]), !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 0), !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND204]], align 4, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND205]], align 4, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]], !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 1, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_HEADER191:%.*]], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT:    store i32 0, i32* [[P_LOWERBOUND204]], align 4, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    store i32 [[TMP3]], i32* [[P_UPPERBOUND205]], align 4, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    store i32 1, i32* [[P_STRIDE206]], align 4, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM207:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB42:[0-9]+]]), !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 0), !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND204]], align 4, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND205]], align 4, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 1, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_HEADER191:%.*]], !dbg [[DBG148]]
 // CHECK-DEBUG:       omp_loop.header191:
-// CHECK-DEBUG-NEXT:    [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ], !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_COND192:%.*]], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT:    [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_COND192:%.*]], !dbg [[DBG148]]
 // CHECK-DEBUG:       omp_loop.cond192:
-// CHECK-DEBUG-NEXT:    [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]], !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    br i1 [[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT:    [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    br i1 [[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]], !dbg [[DBG148]]
 // CHECK-DEBUG:       omp_loop.body193:
-// CHECK-DEBUG-NEXT:    [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]], !dbg [[DBG151:![0-9]+]]
-// CHECK-DEBUG-NEXT:    call void @__captured_stmt.20(i32* [[I185]], i32 [[TMP8]], %struct.anon.18* [[AGG_CAPTURED187]]), !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG152:![0-9]+]]
-// CHECK-DEBUG-NEXT:    [[CONV200:%.*]] = sitofp i32 [[TMP9]] to double, !dbg [[DBG152]]
-// CHECK-DEBUG-NEXT:    [[TMP10:%.*]] = load double, double* [[B_ADDR]], align 8, !dbg [[DBG151]]
-// CHECK-DEBUG-NEXT:    [[ADD201:%.*]] = fadd double [[CONV200]], [[TMP10]], !dbg [[DBG153:![0-9]+]]
-// CHECK-DEBUG-NEXT:    [[CONV202:%.*]] = fptrunc double [[ADD201]] to float, !dbg [[DBG152]]
-// CHECK-DEBUG-NEXT:    [[TMP11:%.*]] = load float*, float** [[R_ADDR]], align 8, !dbg [[DBG154:![0-9]+]]
-// CHECK-DEBUG-NEXT:    store float [[CONV202]], float* [[TMP11]], align 4, !dbg [[DBG155:![0-9]+]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_INC194]], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT:    [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]], !dbg [[DBG150:![0-9]+]]
+// CHECK-DEBUG-NEXT:    call void @__captured_stmt.20(i32* [[I185]], i32 [[TMP8]], %struct.anon.18* [[AGG_CAPTURED187]]), !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG151:![0-9]+]]
+// CHECK-DEBUG-NEXT:    [[CONV200:%.*]] = sitofp i32 [[TMP9]] to double, !dbg [[DBG151]]
+// CHECK-DEBUG-NEXT:    [[TMP10:%.*]] = load double, double* [[B_ADDR]], align 8, !dbg [[DBG150]]
+// CHECK-DEBUG-NEXT:    [[ADD201:%.*]] = fadd double [[CONV200]], [[TMP10]], !dbg [[DBG152:![0-9]+]]
+// CHECK-DEBUG-NEXT:    [[CONV202:%.*]] = fptrunc double [[ADD201]] to float, !dbg [[DBG151]]
+// CHECK-DEBUG-NEXT:    [[TMP11:%.*]] = load float*, float** [[R_ADDR]], align 8, !dbg [[DBG153:![0-9]+]]
+// CHECK-DEBUG-NEXT:    store float [[CONV202]], float* [[TMP11]], align 4, !dbg [[DBG154:![0-9]+]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_INC194]], !dbg [[DBG148]]
 // CHECK-DEBUG:       omp_loop.inc194:
-// CHECK-DEBUG-NEXT:    [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1, !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_HEADER191]], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT:    [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_HEADER191]], !dbg [[DBG148]]
 // CHECK-DEBUG:       omp_loop.exit195:
-// CHECK-DEBUG-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]]), !dbg [[DBG149]]
-// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM208:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB42]]), !dbg [[DBG151]]
-// CHECK-DEBUG-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB43:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM208]]), !dbg [[DBG151]]
-// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_AFTER196:%.*]], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]]), !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT:    [[OMP_GLOBAL_THREAD_NUM208:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB42]]), !dbg [[DBG150]]
+// CHECK-DEBUG-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB43:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM208]]), !dbg [[DBG150]]
+// CHECK-DEBUG-NEXT:    br label [[OMP_LOOP_AFTER196:%.*]], !dbg [[DBG148]]
 // CHECK-DEBUG:       omp_loop.after196:
-// CHECK-DEBUG-NEXT:    ret void, !dbg [[DBG156:![0-9]+]]
+// CHECK-DEBUG-NEXT:    ret void, !dbg [[DBG155:![0-9]+]]
 //
 void parallel_for_2(float *r, int a, double b) {
 #pragma omp parallel
diff --git a/clang/test/Syntax/lex.c b/clang/test/Syntax/lex.c
--- a/clang/test/Syntax/lex.c
+++ b/clang/test/Syntax/lex.c
@@ -39,7 +39,7 @@
 TOKEN-NEXT: raw_identifier   5:0 "endif"
 TOKEN-NEXT: r_brace          6:0 "}" flags=1
 
-RUN: clang-pseudo -source %s -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace
+RUN: clang-pseudo -source %s -print-directive-map | FileCheck %s -check-prefix=PPS --strict-whitespace
      PPS: code (5 tokens)
 PPS-NEXT: #ifndef (3 tokens)
 PPS-NEXT:   code (4 tokens)
diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp
--- a/clang/tools/clang-pseudo/ClangPseudo.cpp
+++ b/clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -7,10 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Basic/LangOptions.h"
+#include "clang/Tooling/Syntax/Pseudo/DirectiveMap.h"
 #include "clang/Tooling/Syntax/Pseudo/Grammar.h"
 #include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
 #include "clang/Tooling/Syntax/Pseudo/LRTable.h"
-#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
 #include "clang/Tooling/Syntax/Pseudo/Token.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
@@ -33,8 +33,8 @@
 static opt<bool> PrintSource("print-source", desc("Print token stream"));
 static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
 static opt<bool>
-    PrintPPStructure("print-pp-structure",
-                     desc("Print directive structure of source code"));
+    PrintDirectiveMap("print-directive-map",
+                      desc("Print directive structure of source code"));
 
 static std::string readOrDie(llvm::StringRef Path) {
   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
@@ -76,9 +76,9 @@
     std::string Text = readOrDie(Source);
     clang::LangOptions LangOpts; // FIXME: use real options.
     auto Stream = clang::syntax::pseudo::lex(Text, LangOpts);
-    auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream);
+    auto Structure = clang::syntax::pseudo::DirectiveMap::parse(Stream);
 
-    if (PrintPPStructure)
+    if (PrintDirectiveMap)
       llvm::outs() << Structure;
     if (PrintSource)
       Stream.print(llvm::outs());
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -12669,6 +12669,13 @@
                "};",
                MergeInlineOnly);
   verifyFormat("int f() {}", MergeInlineOnly);
+  // https://llvm.org/PR54147
+  verifyFormat("auto lambda = []() {\n"
+               "  // comment\n"
+               "  f();\n"
+               "  g();\n"
+               "};",
+               MergeInlineOnly);
 
   // Also verify behavior when BraceWrapping.AfterFunction = true
   MergeInlineOnly.BreakBeforeBraces = FormatStyle::BS_Custom;
diff --git a/clang/unittests/Tooling/LookupTest.cpp b/clang/unittests/Tooling/LookupTest.cpp
--- a/clang/unittests/Tooling/LookupTest.cpp
+++ b/clang/unittests/Tooling/LookupTest.cpp
@@ -8,12 +8,15 @@
 
 #include "clang/Tooling/Refactoring/Lookup.h"
 #include "TestVisitor.h"
+#include "clang/AST/TypeLoc.h"
+#include "clang/Basic/SourceLocation.h"
 using namespace clang;
 
 namespace {
 struct GetDeclsVisitor : TestVisitor<GetDeclsVisitor> {
   std::function<void(CallExpr *)> OnCall;
   std::function<void(RecordTypeLoc)> OnRecordTypeLoc;
+  std::function<void(UsingTypeLoc)> OnUsingTypeLoc;
   SmallVector<Decl *, 4> DeclStack;
 
   bool VisitCallExpr(CallExpr *Expr) {
@@ -28,6 +31,12 @@
     return true;
   }
 
+  bool VisitUsingTypeLoc(UsingTypeLoc Loc) {
+    if (OnUsingTypeLoc)
+      OnUsingTypeLoc(Loc);
+    return true;
+  }
+
   bool TraverseDecl(Decl *D) {
     DeclStack.push_back(D);
     bool Ret = TestVisitor::TraverseDecl(D);
@@ -181,19 +190,19 @@
 TEST(LookupTest, replaceNestedClassName) {
   GetDeclsVisitor Visitor;
 
-  auto replaceRecordTypeLoc = [&](RecordTypeLoc TLoc,
-                                  StringRef ReplacementString) {
-    const auto *FD = cast<CXXRecordDecl>(TLoc.getDecl());
+  auto replaceTypeLoc = [&](const NamedDecl *ND, SourceLocation Loc,
+                            StringRef ReplacementString) {
     return tooling::replaceNestedName(
-        nullptr, TLoc.getBeginLoc(), Visitor.DeclStack.back()->getDeclContext(),
-        FD, ReplacementString);
+        nullptr, Loc, Visitor.DeclStack.back()->getDeclContext(), ND,
+        ReplacementString);
   };
 
   Visitor.OnRecordTypeLoc = [&](RecordTypeLoc Type) {
     // Filter Types by name since there are other `RecordTypeLoc` in the test
     // file.
     if (Type.getDecl()->getQualifiedNameAsString() == "a::b::Foo") {
-      EXPECT_EQ("x::Bar", replaceRecordTypeLoc(Type, "::a::x::Bar"));
+      EXPECT_EQ("x::Bar", replaceTypeLoc(Type.getDecl(), Type.getBeginLoc(),
+                                         "::a::x::Bar"));
     }
   };
   Visitor.runOver("namespace a { namespace b {\n"
@@ -201,12 +210,13 @@
                   "namespace c { Foo f();; }\n"
                   "} }\n");
 
-  Visitor.OnRecordTypeLoc = [&](RecordTypeLoc Type) {
+  Visitor.OnUsingTypeLoc = [&](UsingTypeLoc Type) {
     // Filter Types by name since there are other `RecordTypeLoc` in the test
     // file.
     // `a::b::Foo` in using shadow decl is not `TypeLoc`.
-    if (Type.getDecl()->getQualifiedNameAsString() == "a::b::Foo") {
-      EXPECT_EQ("Bar", replaceRecordTypeLoc(Type, "::a::x::Bar"));
+    auto *TD = Type.getFoundDecl()->getTargetDecl();
+    if (TD->getQualifiedNameAsString() == "a::b::Foo") {
+      EXPECT_EQ("Bar", replaceTypeLoc(TD, Type.getBeginLoc(), "::a::x::Bar"));
     }
   };
   Visitor.runOver("namespace a { namespace b { class Foo {}; } }\n"
@@ -218,7 +228,8 @@
   // it's not visible at [0].
   Visitor.OnRecordTypeLoc = [&](RecordTypeLoc Type) {
     if (Type.getDecl()->getQualifiedNameAsString() == "x::y::Old") {
-      EXPECT_EQ("Foo", replaceRecordTypeLoc(Type, "::x::Foo"));
+      EXPECT_EQ("Foo",
+                replaceTypeLoc(Type.getDecl(), Type.getBeginLoc(), "::x::Foo"));
     }
   };
   Visitor.runOver(R"(
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
--- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -3,9 +3,9 @@
   )
 
 add_clang_unittest(ClangPseudoTests
+  DirectiveMapTest.cpp
   GrammarTest.cpp
   LRTableTest.cpp
-  PreprocessTest.cpp
   TokenTest.cpp
 )
 
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/DirectiveMapTest.cpp
rename from clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp
rename to clang/unittests/Tooling/Syntax/Pseudo/DirectiveMapTest.cpp
--- a/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp
+++ b/clang/unittests/Tooling/Syntax/Pseudo/DirectiveMapTest.cpp
@@ -1,4 +1,4 @@
-//===--- TokenTest.cpp ----------------------------------------------------===//
+//===--- DirectiveMapTest.cpp ---------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Tooling/Syntax/Pseudo/DirectiveMap.h"
 
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TokenKinds.h"
@@ -26,7 +26,7 @@
 using testing::Matcher;
 using testing::Pair;
 using testing::StrEq;
-using Chunk = PPStructure::Chunk;
+using Chunk = DirectiveMap::Chunk;
 
 MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) {
   std::vector<llvm::StringRef> Texts;
@@ -38,7 +38,7 @@
 
 MATCHER_P(chunkKind, K, "") { return arg.kind() == K; }
 
-TEST(PPStructure, Parse) {
+TEST(DirectiveMap, Parse) {
   LangOptions Opts;
   std::string Code = R"cpp(
   #include <foo.h>
@@ -57,30 +57,30 @@
   )cpp";
 
   TokenStream S = cook(lex(Code, Opts), Opts);
-  PPStructure PP = PPStructure::parse(S);
+  DirectiveMap PP = DirectiveMap::parse(S);
 
   ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Directive),
                                      chunkKind(Chunk::K_Code),
                                      chunkKind(Chunk::K_Conditional),
                                      chunkKind(Chunk::K_Code)));
 
-  EXPECT_THAT((const PPStructure::Directive &)PP.Chunks[0],
+  EXPECT_THAT((const DirectiveMap::Directive &)PP.Chunks[0],
               tokensAre(S, "# include < foo . h >"));
-  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[1],
+  EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[1],
               tokensAre(S, "int main ( ) {"));
-  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[3], tokensAre(S, "}"));
+  EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[3], tokensAre(S, "}"));
 
-  const PPStructure::Conditional &Ifdef(PP.Chunks[2]);
+  const DirectiveMap::Conditional &Ifdef(PP.Chunks[2]);
   EXPECT_THAT(Ifdef.Branches,
               ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _),
                           Pair(tokensAre(S, "# elif NEEDS_FOO"), _)));
   EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif"));
 
-  const PPStructure &HasFoo(Ifdef.Branches[0].second);
-  const PPStructure &NeedsFoo(Ifdef.Branches[1].second);
+  const DirectiveMap &HasFoo(Ifdef.Branches[0].second);
+  const DirectiveMap &NeedsFoo(Ifdef.Branches[1].second);
 
   EXPECT_THAT(HasFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Conditional)));
-  const PPStructure::Conditional &If(HasFoo.Chunks[0]);
+  const DirectiveMap::Conditional &If(HasFoo.Chunks[0]);
   EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _),
                                        Pair(tokensAre(S, "# else"), _)));
   EXPECT_THAT(If.Branches[0].second.Chunks,
@@ -89,12 +89,12 @@
               ElementsAre(chunkKind(Chunk::K_Code)));
 
   EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Directive)));
-  const PPStructure::Directive &Error(NeedsFoo.Chunks[0]);
+  const DirectiveMap::Directive &Error(NeedsFoo.Chunks[0]);
   EXPECT_THAT(Error, tokensAre(S, "# error missing_foo"));
   EXPECT_EQ(Error.Kind, tok::pp_error);
 }
 
-TEST(PPStructure, ParseUgly) {
+TEST(DirectiveMap, ParseUgly) {
   LangOptions Opts;
   std::string Code = R"cpp(
   /*A*/ # /*B*/ \
@@ -104,19 +104,19 @@
 /*E*/
 )cpp";
   TokenStream S = cook(lex(Code, Opts), Opts);
-  PPStructure PP = PPStructure::parse(S);
+  DirectiveMap PP = DirectiveMap::parse(S);
 
   ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
                                      chunkKind(Chunk::K_Directive),
                                      chunkKind(Chunk::K_Code)));
-  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "/*A*/"));
-  const PPStructure::Directive &Define(PP.Chunks[1]);
+  EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[0], tokensAre(S, "/*A*/"));
+  const DirectiveMap::Directive &Define(PP.Chunks[1]);
   EXPECT_EQ(Define.Kind, tok::pp_define);
   EXPECT_THAT(Define, tokensAre(S, "# /*B*/ /*C*/ define BAR /*D*/"));
-  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[2], tokensAre(S, "/*E*/"));
+  EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[2], tokensAre(S, "/*E*/"));
 }
 
-TEST(PPStructure, ParseBroken) {
+TEST(DirectiveMap, ParseBroken) {
   LangOptions Opts;
   std::string Code = R"cpp(
   a
@@ -125,17 +125,17 @@
   b
 )cpp";
   TokenStream S = cook(lex(Code, Opts), Opts);
-  PPStructure PP = PPStructure::parse(S);
+  DirectiveMap PP = DirectiveMap::parse(S);
 
   ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
                                      chunkKind(Chunk::K_Directive),
                                      chunkKind(Chunk::K_Conditional)));
-  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "a"));
-  const PPStructure::Directive &Endif(PP.Chunks[1]);
+  EXPECT_THAT((const DirectiveMap::Code &)PP.Chunks[0], tokensAre(S, "a"));
+  const DirectiveMap::Directive &Endif(PP.Chunks[1]);
   EXPECT_EQ(Endif.Kind, tok::pp_endif);
   EXPECT_THAT(Endif, tokensAre(S, "# endif // mismatched"));
 
-  const PPStructure::Conditional &X(PP.Chunks[2]);
+  const DirectiveMap::Conditional &X(PP.Chunks[2]);
   EXPECT_EQ(1u, X.Branches.size());
   // The (only) branch of the broken conditional section runs until eof.
   EXPECT_EQ(tok::pp_if, X.Branches.front().first.Kind);
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -41,7 +41,7 @@
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp
--- a/flang/tools/fir-opt/fir-opt.cpp
+++ b/flang/tools/fir-opt/fir-opt.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Support/MlirOptMain.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
 #include "flang/Optimizer/CodeGen/CodeGen.h"
 #include "flang/Optimizer/Support/InitFIR.h"
 #include "flang/Optimizer/Transforms/Passes.h"
diff --git a/flang/tools/tco/tco.cpp b/flang/tools/tco/tco.cpp
--- a/flang/tools/tco/tco.cpp
+++ b/flang/tools/tco/tco.cpp
@@ -20,7 +20,7 @@
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
@@ -79,7 +79,7 @@
   mlir::MLIRContext context(registry);
   fir::support::loadDialects(context);
   fir::support::registerLLVMTranslation(context);
-  auto owningRef = mlir::parseSourceFile(sourceMgr, &context);
+  auto owningRef = mlir::parseSourceFile<mlir::ModuleOp>(sourceMgr, &context);
 
   if (!owningRef) {
     errs() << "Error can't load file " << inputFilename << '\n';
diff --git a/libc/loader/linux/x86_64/start.cpp b/libc/loader/linux/x86_64/start.cpp
--- a/libc/loader/linux/x86_64/start.cpp
+++ b/libc/loader/linux/x86_64/start.cpp
@@ -38,7 +38,9 @@
     return;
 
   // We will assume the alignment is always a power of two.
-  uintptr_t tlsSize = (app.tls.size + app.tls.align) & -app.tls.align;
+  uintptr_t tlsSize = app.tls.size & -app.tls.align;
+  if (tlsSize != app.tls.size)
+    tlsSize += app.tls.align;
 
   // Per the x86_64 TLS ABI, the entry pointed to by the thread pointer is the
   // address of the TLS block. So, we add more size to accomodate this address
diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst
--- a/libcxx/docs/ReleaseNotes.rst
+++ b/libcxx/docs/ReleaseNotes.rst
@@ -39,7 +39,7 @@
 ------------
 
  - Implemented P0627R6 (Function to mark unreachable code)
- - Implemented P1165R1 (Make stateful allocator propagation more consistent for operator+(basic_string))
+ - Implemented P1165R1 (Make stateful allocator propagation more consistent for ``operator+(basic_string)``)
 
 API Changes
 -----------
@@ -49,6 +49,7 @@
   they were not supposed to set ``_LIBCPP_ABI_UNSTABLE`` manually, however we
   still feel that it is worth mentioning in the release notes in case some users
   had been doing it.
+
 - The header ``<experimental/filesystem>`` has been removed. Instead, use
   ``<filesystem>`` header. The associated macro
   ``_LIBCPP_DEPRECATED_EXPERIMENTAL_FILESYSTEM`` has also been removed.
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -66,6 +66,7 @@
   __algorithm/pop_heap.h
   __algorithm/prev_permutation.h
   __algorithm/push_heap.h
+  __algorithm/ranges_max_element.h
   __algorithm/ranges_min_element.h
   __algorithm/ranges_swap_ranges.h
   __algorithm/remove.h
@@ -448,6 +449,7 @@
   ctgmath
   ctime
   ctype.h
+  cuchar
   cwchar
   cwctype
   deque
@@ -541,6 +543,7 @@
   type_traits
   typeindex
   typeinfo
+  uchar.h
   unordered_map
   unordered_set
   utility
diff --git a/libcxx/include/__algorithm/ranges_max_element.h b/libcxx/include/__algorithm/ranges_max_element.h
new file mode 100644
--- /dev/null
+++ b/libcxx/include/__algorithm/ranges_max_element.h
@@ -0,0 +1,69 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_RANGES_MAX_ELEMENT_H
+#define _LIBCPP___ALGORITHM_RANGES_MAX_ELEMENT_H
+
+#include <__config>
+#include <__functional/identity.h>
+#include <__functional/invoke.h>
+#include <__functional/ranges_operations.h>
+#include <__iterator/concepts.h>
+#include <__iterator/projected.h>
+#include <__ranges/access.h>
+#include <__ranges/concepts.h>
+#include <__ranges/dangling.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES)
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace ranges {
+namespace __max_element {
+struct __fn {
+  template <class _Ip, class _Sp, class _Proj, class _Comp>
+  _LIBCPP_HIDE_FROM_ABI static constexpr _Ip __go(_Ip __first, _Sp __last, _Comp& __comp, _Proj& __proj) {
+    if (__first == __last)
+      return __first;
+
+    _Ip __i = __first;
+    while (++__i != __last)
+      if (std::invoke(__comp, std::invoke(__proj, *__first), std::invoke(__proj, *__i)))
+        __first = __i;
+    return __first;
+  }
+
+  template <forward_iterator _Ip, sentinel_for<_Ip> _Sp, class _Proj = identity,
+            indirect_strict_weak_order<projected<_Ip, _Proj>> _Comp = ranges::less>
+  _LIBCPP_HIDE_FROM_ABI constexpr _Ip operator()(_Ip __first, _Sp __last, _Comp __comp = {}, _Proj __proj = {}) const {
+    return __go(__first, __last, __comp, __proj);
+  }
+
+  template <forward_range _Rp, class _Proj = identity,
+            indirect_strict_weak_order<projected<iterator_t<_Rp>, _Proj>> _Comp = ranges::less>
+  _LIBCPP_HIDE_FROM_ABI constexpr borrowed_iterator_t<_Rp> operator()(_Rp&& __r, _Comp __comp = {},
+                                                                      _Proj __proj = {}) const {
+    return __go(ranges::begin(__r), ranges::end(__r), __comp, __proj);
+  }
+};
+} // namespace __max_element
+
+inline namespace __cpo {
+inline constexpr auto max_element = __max_element::__fn{};
+} // namespace __cpo
+} // namespace ranges
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES)
+
+#endif // _LIBCPP___ALGORITHM_RANGES_MAX_ELEMENT_H
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h
--- a/libcxx/include/__memory/unique_ptr.h
+++ b/libcxx/include/__memory/unique_ptr.h
@@ -46,10 +46,8 @@
                      0) _NOEXCEPT {}
 
   _LIBCPP_INLINE_VISIBILITY void operator()(_Tp* __ptr) const _NOEXCEPT {
-    static_assert(sizeof(_Tp) > 0,
-                  "default_delete can not delete incomplete type");
-    static_assert(!is_void<_Tp>::value,
-                  "default_delete can not delete incomplete type");
+    static_assert(sizeof(_Tp) >= 0, "cannot delete an incomplete type");
+    static_assert(!is_void<_Tp>::value, "cannot delete an incomplete type");
     delete __ptr;
   }
 };
@@ -77,10 +75,7 @@
   _LIBCPP_INLINE_VISIBILITY
   typename _EnableIfConvertible<_Up>::type
   operator()(_Up* __ptr) const _NOEXCEPT {
-    static_assert(sizeof(_Tp) > 0,
-                  "default_delete can not delete incomplete type");
-    static_assert(!is_void<_Tp>::value,
-                  "default_delete can not delete void type");
+    static_assert(sizeof(_Up) >= 0, "cannot delete an incomplete type");
     delete[] __ptr;
   }
 };
diff --git a/libcxx/include/__ranges/access.h b/libcxx/include/__ranges/access.h
--- a/libcxx/include/__ranges/access.h
+++ b/libcxx/include/__ranges/access.h
@@ -58,14 +58,14 @@
   struct __fn {
     template <class _Tp>
     [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Tp (&__t)[]) const noexcept
-      requires (sizeof(_Tp) != 0)  // Disallow incomplete element types.
+        requires(sizeof(_Tp) >= 0) // Disallow incomplete element types.
     {
       return __t + 0;
     }
 
     template <class _Tp, size_t _Np>
     [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Tp (&__t)[_Np]) const noexcept
-      requires (sizeof(_Tp) != 0)  // Disallow incomplete element types.
+        requires(sizeof(_Tp) >= 0) // Disallow incomplete element types.
     {
       return __t + 0;
     }
@@ -132,7 +132,7 @@
   public:
     template <class _Tp, size_t _Np>
     [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Tp (&__t)[_Np]) const noexcept
-      requires (sizeof(_Tp) != 0)  // Disallow incomplete element types.
+        requires(sizeof(_Tp) >= 0) // Disallow incomplete element types.
     {
       return __t + _Np;
     }
diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm
--- a/libcxx/include/algorithm
+++ b/libcxx/include/algorithm
@@ -763,6 +763,7 @@
 #include <__algorithm/pop_heap.h>
 #include <__algorithm/prev_permutation.h>
 #include <__algorithm/push_heap.h>
+#include <__algorithm/ranges_max_element.h>
 #include <__algorithm/ranges_min_element.h>
 #include <__algorithm/ranges_swap_ranges.h>
 #include <__algorithm/remove.h>
diff --git a/libcxx/include/cuchar b/libcxx/include/cuchar
new file mode 100644
--- /dev/null
+++ b/libcxx/include/cuchar
@@ -0,0 +1,60 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_CUCHAR
+#define _LIBCPP_CUCHAR
+
+/*
+    cuchar synopsis // since C++11
+
+Macros:
+
+    __STDC_UTF_16__
+    __STDC_UTF_32__
+
+namespace std {
+
+Types:
+
+  mbstate_t
+  size_t
+
+size_t mbrtoc16(char16_t* pc16, const char* s, size_t n, mbstate_t* ps);
+size_t c16rtomb(char* s, char16_t c16, mbstate_t* ps);
+size_t mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps);
+size_t c32rtomb(char* s, char32_t c32, mbstate_t* ps);
+
+} // std
+
+*/
+
+#include <__config>
+#include <uchar.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if !defined(_LIBCPP_CXX03_LANG)
+
+using ::mbstate_t _LIBCPP_USING_IF_EXISTS;
+using ::size_t _LIBCPP_USING_IF_EXISTS;
+
+using ::mbrtoc16 _LIBCPP_USING_IF_EXISTS;
+using ::c16rtomb _LIBCPP_USING_IF_EXISTS;
+using ::mbrtoc32 _LIBCPP_USING_IF_EXISTS;
+using ::c32rtomb _LIBCPP_USING_IF_EXISTS;
+
+#endif // !defined(_LIBCPP_CXX03_LANG)
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif  // _LIBCPP_CUCHAR
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -80,7 +80,10 @@
       header "string.h"
       export *
     }
-    // FIXME: <uchar.h> is missing.
+    module uchar_h {
+      header "uchar.h"
+      export *
+    }
     // <time.h> provided by C library.
     module wchar_h {
       // <wchar.h>'s __need_* macros require textual inclusion.
@@ -203,7 +206,10 @@
       header "ctime"
       export *
     }
-    // FIXME: <cuchar> is missing.
+    module cuchar {
+      header "cuchar"
+      export *
+    }
     module cwchar {
       header "cwchar"
       export depr.stdio_h
@@ -288,6 +294,7 @@
       module pop_heap                 { private header "__algorithm/pop_heap.h" }
       module prev_permutation         { private header "__algorithm/prev_permutation.h" }
       module push_heap                { private header "__algorithm/push_heap.h" }
+      module ranges_max_element       { private header "__algorithm/ranges_max_element.h" }
       module ranges_min_element       { private header "__algorithm/ranges_min_element.h" }
       module ranges_swap_ranges       { private header "__algorithm/ranges_swap_ranges.h" }
       module remove                   { private header "__algorithm/remove.h" }
diff --git a/libcxx/include/uchar.h b/libcxx/include/uchar.h
new file mode 100644
--- /dev/null
+++ b/libcxx/include/uchar.h
@@ -0,0 +1,52 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_UCHAR_H
+#define _LIBCPP_UCHAR_H
+
+/*
+    uchar.h synopsis // since C++11
+
+Macros:
+
+    __STDC_UTF_16__
+    __STDC_UTF_32__
+
+Types:
+
+  mbstate_t
+  size_t
+
+size_t mbrtoc16(char16_t* pc16, const char* s, size_t n, mbstate_t* ps);
+size_t c16rtomb(char* s, char16_t c16, mbstate_t* ps);
+size_t mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps);
+size_t c32rtomb(char* s, char32_t c32, mbstate_t* ps);
+
+*/
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_CXX03_LANG)
+
+// Some platforms don't implement <uchar.h> and we don't want to give a hard
+// error on those platforms. When the platform doesn't provide <uchar.h>, at
+// least include <stddef.h> so we get the declaration for size_t.
+#  if __has_include_next(<uchar.h>)
+#    include_next <uchar.h>
+#  else
+#    include <stddef.h>
+#  endif
+
+#endif // !defined(_LIBCPP_CXX03_LANG)
+
+#endif // _LIBCPP_UCHAR_H
diff --git a/libcxx/test/libcxx/clang_tidy.sh.cpp b/libcxx/test/libcxx/clang_tidy.sh.cpp
--- a/libcxx/test/libcxx/clang_tidy.sh.cpp
+++ b/libcxx/test/libcxx/clang_tidy.sh.cpp
@@ -74,6 +74,7 @@
 #include <ctgmath>
 #include <ctime>
 #include <ctype.h>
+#include <cuchar>
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
 #    include <cwchar>
 #endif
@@ -188,6 +189,7 @@
 #include <type_traits>
 #include <typeindex>
 #include <typeinfo>
+#include <uchar.h>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
diff --git a/libcxx/test/std/depr/depr.c.headers/uchar_h.pass.cpp b/libcxx/test/libcxx/diagnostics/detail.headers/algorithm/ranges_max_element.module.verify.cpp
rename from libcxx/test/std/depr/depr.c.headers/uchar_h.pass.cpp
rename to libcxx/test/libcxx/diagnostics/detail.headers/algorithm/ranges_max_element.module.verify.cpp
--- a/libcxx/test/std/depr/depr.c.headers/uchar_h.pass.cpp
+++ b/libcxx/test/libcxx/diagnostics/detail.headers/algorithm/ranges_max_element.module.verify.cpp
@@ -5,18 +5,11 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// XFAIL: suse-linux-enterprise-server-11
-// XFAIL: darwin
-// XFAIL: netbsd
-// XFAIL: LIBCXX-AIX-FIXME
-
-// <uchar.h>
 
-#include <uchar.h>
+// REQUIRES: modules-build
 
-int main(int, char**)
-{
+// WARNING: This test was generated by 'generate_private_header_tests.py'
+// and should not be edited manually.
 
-  return 0;
-}
+// expected-error@*:* {{use of private header from outside its module: '__algorithm/ranges_max_element.h'}}
+#include <__algorithm/ranges_max_element.h>
diff --git a/libcxx/test/libcxx/double_include.sh.cpp b/libcxx/test/libcxx/double_include.sh.cpp
--- a/libcxx/test/libcxx/double_include.sh.cpp
+++ b/libcxx/test/libcxx/double_include.sh.cpp
@@ -13,6 +13,9 @@
 // RUN: %{cxx} -o %t.exe %t.first.o %t.second.o %{flags} %{link_flags}
 // RUN: %{run}
 
+// The system-provided <uchar.h> seems to be broken on AIX
+// XFAIL: LIBCXX-AIX-FIXME
+
 // Prevent <ext/hash_map> from generating deprecated warnings for this test.
 #if defined(__DEPRECATED)
 #    undef __DEPRECATED
@@ -75,6 +78,7 @@
 #include <ctgmath>
 #include <ctime>
 #include <ctype.h>
+#include <cuchar>
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
 #    include <cwchar>
 #endif
@@ -189,6 +193,7 @@
 #include <type_traits>
 #include <typeindex>
 #include <typeinfo>
+#include <uchar.h>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
diff --git a/libcxx/test/libcxx/min_max_macros.compile.pass.cpp b/libcxx/test/libcxx/min_max_macros.compile.pass.cpp
--- a/libcxx/test/libcxx/min_max_macros.compile.pass.cpp
+++ b/libcxx/test/libcxx/min_max_macros.compile.pass.cpp
@@ -9,6 +9,9 @@
 // Test that headers are not tripped up by the surrounding code defining the
 // min() and max() macros.
 
+// The system-provided <uchar.h> seems to be broken on AIX
+// XFAIL: LIBCXX-AIX-FIXME
+
 // Prevent <ext/hash_map> from generating deprecated warnings for this test.
 #if defined(__DEPRECATED)
 #    undef __DEPRECATED
@@ -114,6 +117,8 @@
 TEST_MACROS();
 #include <ctype.h>
 TEST_MACROS();
+#include <cuchar>
+TEST_MACROS();
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
 #    include <cwchar>
 TEST_MACROS();
@@ -296,6 +301,8 @@
 TEST_MACROS();
 #include <typeinfo>
 TEST_MACROS();
+#include <uchar.h>
+TEST_MACROS();
 #include <unordered_map>
 TEST_MACROS();
 #include <unordered_set>
diff --git a/libcxx/test/libcxx/nasty_macros.compile.pass.cpp b/libcxx/test/libcxx/nasty_macros.compile.pass.cpp
--- a/libcxx/test/libcxx/nasty_macros.compile.pass.cpp
+++ b/libcxx/test/libcxx/nasty_macros.compile.pass.cpp
@@ -9,6 +9,9 @@
 // Test that headers are not tripped up by the surrounding code defining various
 // alphabetic macros.
 
+// The system-provided <uchar.h> seems to be broken on AIX
+// XFAIL: LIBCXX-AIX-FIXME
+
 // Prevent <ext/hash_map> from generating deprecated warnings for this test.
 #if defined(__DEPRECATED)
 #    undef __DEPRECATED
@@ -185,6 +188,7 @@
 #include <ctgmath>
 #include <ctime>
 #include <ctype.h>
+#include <cuchar>
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
 #    include <cwchar>
 #endif
@@ -299,6 +303,7 @@
 #include <type_traits>
 #include <typeindex>
 #include <typeinfo>
+#include <uchar.h>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
diff --git a/libcxx/test/libcxx/no_assert_include.compile.pass.cpp b/libcxx/test/libcxx/no_assert_include.compile.pass.cpp
--- a/libcxx/test/libcxx/no_assert_include.compile.pass.cpp
+++ b/libcxx/test/libcxx/no_assert_include.compile.pass.cpp
@@ -9,6 +9,9 @@
 // Ensure that none of the standard C++ headers implicitly include cassert or
 // assert.h (because assert() is implemented as a macro).
 
+// The system-provided <uchar.h> seems to be broken on AIX
+// XFAIL: LIBCXX-AIX-FIXME
+
 // Prevent <ext/hash_map> from generating deprecated warnings for this test.
 #if defined(__DEPRECATED)
 #    undef __DEPRECATED
@@ -70,6 +73,7 @@
 #include <ctgmath>
 #include <ctime>
 #include <ctype.h>
+#include <cuchar>
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
 #    include <cwchar>
 #endif
@@ -184,6 +188,7 @@
 #include <type_traits>
 #include <typeindex>
 #include <typeinfo>
+#include <uchar.h>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
diff --git a/libcxx/test/libcxx/strings/c.strings/version_cuchar.pass.cpp b/libcxx/test/libcxx/strings/c.strings/version_cuchar.pass.cpp
--- a/libcxx/test/libcxx/strings/c.strings/version_cuchar.pass.cpp
+++ b/libcxx/test/libcxx/strings/c.strings/version_cuchar.pass.cpp
@@ -5,13 +5,11 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// XFAIL: *
 
-// Skip this test on windows. If built on top of the MSVC runtime, the
-// <cuchar> header actually does exist (although not provided by us).
-// This should be removed once D97870 has landed.
-// UNSUPPORTED: windows
+// UNSUPPORTED: c++03
+
+// The system-provided <uchar.h> seems to be broken on AIX
+// XFAIL: LIBCXX-AIX-FIXME
 
 // <cuchar>
 
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.min.max/ranges.max_element.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.min.max/ranges.max_element.pass.cpp
new file mode 100644
--- /dev/null
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.min.max/ranges.max_element.pass.cpp
@@ -0,0 +1,196 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <algorithm>
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-no-concepts
+// UNSUPPORTED: libcpp-has-no-incomplete-ranges
+
+//  template<forward_iterator I, sentinel_for<I> S, class Proj = identity,
+//    indirect_strict_weak_order<projected<I, Proj>> Comp = ranges::less>
+//  constexpr I ranges::max_element(I first, S last, Comp comp = {}, Proj proj = {});
+//
+//  template<forward_range R, class Proj = identity,
+//    indirect_strict_weak_order<projected<iterator_t<R>, Proj>> Comp = ranges::less>
+//  constexpr borrowed_iterator_t<R> ranges::max_element(R&& r, Comp comp = {}, Proj proj = {});
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <random>
+#include <ranges>
+
+#include "test_macros.h"
+#include "test_iterators.h"
+
+template <class T>
+concept HasMaxElement = requires(T t) { // is the range overload callable with a T?
+  std::ranges::max_element(t);
+};
+
+struct NoLessThanOp {}; // provides no comparison operators at all
+struct NotTotallyOrdered { // provides operator< only, so it is not totally_ordered
+  int i;
+  bool operator<(const NotTotallyOrdered& o) const { return i < o.i; }
+};
+
+static_assert(HasMaxElement<std::array<int, 0>>);
+static_assert(!HasMaxElement<int>); // not a range
+static_assert(!HasMaxElement<std::array<NoLessThanOp, 0>>);      // range, but elements are not comparable
+static_assert(!HasMaxElement<std::array<NotTotallyOrdered, 0>>); // range, but ranges::less rejects the elements
+
+template <class Iter>
+constexpr void test_iterators(Iter first, Iter last) {
+  std::same_as<Iter> auto result = std::ranges::max_element(first, last);
+  if (first == last) { // empty range: the result must equal first (== last)
+    assert(result == first);
+    return;
+  }
+  for (Iter cur = first; cur != last; ++cur)
+    assert(!(*cur > *result)); // no element may compare greater than the result
+}
+
+template <class Range, class Iter>
+constexpr void test_range(Range&& rng, Iter begin, Iter end) {
+  std::same_as<Iter> auto result = std::ranges::max_element(std::forward<Range>(rng));
+  if (begin == end) { // empty range: the result must equal begin (== end)
+    assert(result == begin);
+    return;
+  }
+  for (Iter cur = begin; cur != end; ++cur)
+    assert(!(*cur > *result)); // no element may compare greater than the result
+}
+
+template <class It>
+constexpr void test(std::initializer_list<int> a, int expected) {
+  using Sent = sentinel_wrapper<It>;
+  const int* const lo = a.begin();
+  const int* const hi = a.end();
+  const int* const want = lo + expected;
+  { // (iterator, iterator)
+    std::same_as<It> auto got = std::ranges::max_element(It(lo), It(hi));
+    assert(base(got) == want);
+  }
+  { // (iterator, sentinel)
+    std::same_as<It> auto got = std::ranges::max_element(It(lo), Sent(It(hi)));
+    assert(base(got) == want);
+  }
+  { // range whose sentinel is the iterator type itself
+    auto rng = std::ranges::subrange(It(lo), It(hi));
+    std::same_as<It> auto got = std::ranges::max_element(rng);
+    assert(base(got) == want);
+  }
+  { // range with a distinct sentinel type
+    auto rng = std::ranges::subrange(It(lo), Sent(It(hi)));
+    std::same_as<It> auto got = std::ranges::max_element(rng);
+    assert(base(got) == want);
+  }
+}
+
+template <class It>
+constexpr bool test() {
+  test<It>({}, 0);        // empty input: expected offset 0 is the end position
+  test<It>({1}, 0);
+  test<It>({1, 2}, 1);
+  test<It>({2, 1}, 0);
+  test<It>({2, 1, 2}, 0); // tie between index 0 and 2: the first one is expected
+  test<It>({2, 1, 1}, 0);
+
+  return true;
+}
+
+constexpr void test_borrowed_range_and_sentinel() {
+  int values[] = {7, 6, 1, 3, 5, 1, 2, 4};
+  // The view is passed as an rvalue, yet a usable int* (not dangling) comes back.
+  int* const largest = std::ranges::max_element(std::views::all(values));
+  assert(largest == values + 0);
+  assert(*largest == 7);
+}
+
+constexpr void test_comparator() {
+  int values[] = {7, 6, 9, 3, 5, 1, 2, 4};
+  int* const found = std::ranges::max_element(values, std::ranges::greater{}); // inverted order
+  assert(found == values + 5); // with greater, the "maximum" is the smallest value
+  assert(*found == 1);
+}
+
+constexpr void test_projection() {
+  int values[] = {7, 6, 9, 3, 5, 1, 2, 4};
+  { // value projection: 5 is mapped to 100 and therefore wins
+    int* hit = std::ranges::max_element(values, std::ranges::less{}, [](int v) { return v == 5 ? 100 : v; });
+    assert(hit == values + 4);
+    assert(*hit == 5);
+  }
+  { // address projection: the last element has the greatest address
+    int* hit = std::ranges::max_element(values, std::less<int*>{}, [](int& v) { return &v; });
+    assert(hit == values + 7);
+    assert(*hit == 4);
+  }
+}
+
+struct Immobile { // element type that can be neither copied nor moved
+  int i;
+
+  constexpr Immobile(int i_) : i(i_) {}
+  Immobile(const Immobile&) = delete;
+  Immobile(Immobile&&) = delete;
+
+  auto operator<=>(const Immobile&) const = default; // member-wise comparison, i.e. by i
+};
+
+constexpr void test_immobile() {
+  // Immobile cannot be copied or moved; the greatest element of {1, 2, 3} is arr[2].
+  Immobile arr[]{1, 2, 3};
+  assert(std::ranges::max_element(arr) == arr + 2);
+  assert(std::ranges::max_element(arr, arr + 3) == arr + 2);
+}
+
+constexpr void test_dangling() {
+  int num_compares = 0;
+  int num_projections = 0;
+  auto counting_less = [&num_compares](int lhs, int rhs) {
+    ++num_compares;
+    return lhs < rhs;
+  };
+  auto counting_identity = [&num_projections](int value) {
+    ++num_projections;
+    return value;
+  };
+  [[maybe_unused]] std::same_as<std::ranges::dangling> auto result = // rvalue array => dangling
+      std::ranges::max_element(std::array{1, 2, 3}, counting_less, counting_identity);
+  assert(num_compares == 2);    // exactly two comparator calls for three elements
+  assert(num_projections == 4); // exactly four projection calls, i.e. two per comparison
+}
+
+constexpr bool test() {
+  // Run the shared value checks once per iterator wrapper, then with raw pointers.
+  test<forward_iterator<const int*>>();
+  test<bidirectional_iterator<const int*>>();
+  test<random_access_iterator<const int*>>();
+  test<const int*>();
+
+  int a[] = {7, 6, 5, 3, 4, 2, 1, 8};
+  test_iterators(a, a + 8); // iterator/sentinel overload
+  int a2[] = {7, 6, 5, 3, 4, 2, 1, 8};
+  test_range(a2, a2, a2 + 8); // range overload
+
+  test_borrowed_range_and_sentinel();
+  test_comparator();
+  test_projection();
+  test_dangling();
+
+  return true; // lets main() use this function inside static_assert
+}
+
+int main(int, char**) {
+  test();                // evaluated at run time
+  static_assert(test()); // evaluated as a constant expression
+
+  return 0;
+}
diff --git a/libcxx/test/std/depr/depr.c.headers/uchar_h.compile.pass.cpp b/libcxx/test/std/depr/depr.c.headers/uchar_h.compile.pass.cpp
new file mode 100644
--- /dev/null
+++ b/libcxx/test/std/depr/depr.c.headers/uchar_h.compile.pass.cpp
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// Apple platforms don't provide <uchar.h> yet, so these tests fail.
+// XFAIL: target={{.+}}-apple-{{.+}}
+
+// The system-provided <uchar.h> seems to be broken on AIX
+// XFAIL: LIBCXX-AIX-FIXME
+
+// <uchar.h>
+
+#include <uchar.h>
+
+#include "test_macros.h"
+
+// __STDC_UTF_16__ may or may not be defined by the C standard library
+// __STDC_UTF_32__ may or may not be defined by the C standard library
+
+ASSERT_SAME_TYPE(size_t, decltype(mbrtoc16((char16_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0)));
+ASSERT_SAME_TYPE(size_t, decltype(c16rtomb((char*)0, (char16_t)0, (mbstate_t*)0)));
+
+ASSERT_SAME_TYPE(size_t, decltype(mbrtoc32((char32_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0)));
+ASSERT_SAME_TYPE(size_t, decltype(c32rtomb((char*)0, (char32_t)0, (mbstate_t*)0))); // UTF-32 counterpart of c16rtomb
diff --git a/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp b/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp
--- a/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp
+++ b/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp
@@ -98,7 +98,7 @@
 //static_assert(test(std::ranges::lower_bound, a, 42));
 //static_assert(test(std::ranges::make_heap, a));
 //static_assert(test(std::ranges::max, a));
-//static_assert(test(std::ranges::max_element, a));
+static_assert(test(std::ranges::max_element, a));
 //static_assert(test(std::ranges::merge, a, a, a));
 //static_assert(test(std::ranges::min, a));
 static_assert(test(std::ranges::min_element, a));
diff --git a/libcxx/test/std/ranges/range.access/begin.sizezero.pass.cpp b/libcxx/test/std/ranges/range.access/begin.sizezero.pass.cpp
new file mode 100644
--- /dev/null
+++ b/libcxx/test/std/ranges/range.access/begin.sizezero.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-no-concepts
+// UNSUPPORTED: msvc
+
+// std::ranges::begin
+// std::ranges::cbegin
+//   Test the fix for https://llvm.org/PR54100
+
+#include <ranges>
+#include <cassert>
+
+#include "test_macros.h"
+
+struct A {
+  int m[0];
+};
+static_assert(sizeof(A) == 0); // zero-length arrays are a GCC/Clang extension
+
+int main(int, char**) {
+  A arr[10]; // ten elements of size zero
+  std::same_as<A*> auto first = std::ranges::begin(arr);
+  assert(first == arr);
+  std::same_as<const A*> auto const_first = std::ranges::cbegin(arr);
+  assert(const_first == arr);
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.access/end.sizezero.pass.cpp b/libcxx/test/std/ranges/range.access/end.sizezero.pass.cpp
new file mode 100644
--- /dev/null
+++ b/libcxx/test/std/ranges/range.access/end.sizezero.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-no-concepts
+// UNSUPPORTED: msvc
+
+// std::ranges::end
+// std::ranges::cend
+//   Test the fix for https://llvm.org/PR54100
+
+#include <ranges>
+#include <cassert>
+
+#include "test_macros.h"
+
+struct A {
+  int m[0];
+};
+static_assert(sizeof(A) == 0); // zero-length arrays are a GCC/Clang extension
+
+int main(int, char**) {
+  A arr[10]; // ten elements of size zero
+  std::same_as<A*> auto last = std::ranges::end(arr);
+  assert(last == arr + 10);
+  std::same_as<const A*> auto const_last = std::ranges::cend(arr);
+  assert(const_last == arr + 10);
+
+  return 0;
+}
diff --git a/libcxx/test/std/strings/c.strings/cuchar.compile.pass.cpp b/libcxx/test/std/strings/c.strings/cuchar.compile.pass.cpp
new file mode 100644
--- /dev/null
+++ b/libcxx/test/std/strings/c.strings/cuchar.compile.pass.cpp
@@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// Apple platforms don't provide <uchar.h> yet, so these tests fail.
+// XFAIL: target={{.+}}-apple-{{.+}}
+
+// The system-provided <uchar.h> seems to be broken on AIX
+// XFAIL: LIBCXX-AIX-FIXME
+
+// <cuchar>
+
+#include <cuchar>
+
+#include "test_macros.h"
+
+// TODO: Implement mbrtoc8 and c8rtomb, and add tests for those
+
+// __STDC_UTF_16__ may or may not be defined by the C standard library
+// __STDC_UTF_32__ may or may not be defined by the C standard library
+
+ASSERT_SAME_TYPE(size_t, decltype(std::mbrtoc16((char16_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0)));
+ASSERT_SAME_TYPE(size_t, decltype(std::c16rtomb((char*)0, (char16_t)0, (mbstate_t*)0)));
+
+ASSERT_SAME_TYPE(size_t, decltype(std::mbrtoc32((char32_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0)));
+ASSERT_SAME_TYPE(size_t, decltype(std::c32rtomb((char*)0, (char32_t)0, (mbstate_t*)0))); // UTF-32 counterpart of c16rtomb
diff --git a/libcxx/test/std/strings/c.strings/cuchar.pass.cpp b/libcxx/test/std/strings/c.strings/cuchar.pass.cpp
deleted file mode 100644
--- a/libcxx/test/std/strings/c.strings/cuchar.pass.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// XFAIL: stdlib=libc++
-
-// Skip this test on windows. If built on top of the MSVC runtime, the
-// <cuchar> header actually does exist (although not provided by us).
-// This should be removed once D97870 has landed.
-// UNSUPPORTED: windows
-
-// <cuchar>
-
-#include <cuchar>
-
-#include "test_macros.h"
-
-int main(int, char**)
-{
-
-  return 0;
-}
diff --git a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.create/make_unique.sizezero.pass.cpp b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.create/make_unique.sizezero.pass.cpp
new file mode 100644
--- /dev/null
+++ b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.create/make_unique.sizezero.pass.cpp
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This code triggers https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104568
+// UNSUPPORTED: gcc-11
+// UNSUPPORTED: msvc
+
+// Test the fix for https://llvm.org/PR54100
+
+#include <memory>
+#include <cassert>
+
+#include "test_macros.h"
+
+struct A {
+  int m[0];
+};
+static_assert(sizeof(A) == 0, ""); // zero-length arrays are a GCC/Clang extension
+
+int main(int, char**) {
+  { // single object adopted from a plain new-expression
+    std::unique_ptr<A> owner = std::unique_ptr<A>(new A);
+    assert(owner != nullptr);
+  }
+  { // array adopted from a plain new[]-expression
+    std::unique_ptr<A[]> owner = std::unique_ptr<A[]>(new A[1]);
+    assert(owner != nullptr);
+  }
+#if TEST_STD_VER > 11
+  { // single object created through make_unique
+    std::unique_ptr<A> owner = std::make_unique<A>();
+    assert(owner != nullptr);
+  }
+  { // array created through make_unique
+    std::unique_ptr<A[]> owner = std::make_unique<A[]>(1);
+    assert(owner != nullptr);
+  }
+#endif
+  return 0;
+}
diff --git a/libcxx/utils/generate_header_inclusion_tests.py b/libcxx/utils/generate_header_inclusion_tests.py
--- a/libcxx/utils/generate_header_inclusion_tests.py
+++ b/libcxx/utils/generate_header_inclusion_tests.py
@@ -67,6 +67,7 @@
     "compare": "20",
     "concepts": "20",
     "coroutine": "20",
+    "cuchar": "11",
     "filesystem": "17",
     "initializer_list": "11",
     "optional": "17",
@@ -76,6 +77,7 @@
     "system_error": "11",
     "thread": "11",
     "tuple": "11",
+    "uchar.h": "11",
     "unordered_map": "11",
     "unordered_set": "11",
     "variant": "17",
diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT
--- a/llvm/CODE_OWNERS.TXT
+++ b/llvm/CODE_OWNERS.TXT
@@ -19,17 +19,13 @@
 I: arsenm
 D: InferAddressSpaces
 
-N: Simon Atanasyan
-E: simon@atanasyan.com
-D: MIPS Backend (lib/Target/Mips/*)
-
 N: Justin Bogner
 E: mail@justinbogner.com
 D: InstrProfiling and related parts of ProfileData
 D: SelectionDAG (lib/CodeGen/SelectionDAG/*)
 
 N: Alex Bradbury
-E: asb@lowrisc.org
+E: asb@asbradbury.org
 D: RISC-V backend (lib/Target/RISCV/*)
 
 N: Matthias Braun
diff --git a/llvm/CREDITS.TXT b/llvm/CREDITS.TXT
--- a/llvm/CREDITS.TXT
+++ b/llvm/CREDITS.TXT
@@ -52,7 +52,7 @@
 D: APFloat implementation.
 
 N: Alex Bradbury
-E: asb@lowrisc.org
+E: asb@asbradbury.org
 D: RISC-V backend
 
 N: Misha Brukman
diff --git a/llvm/include/llvm/Analysis/ConstraintSystem.h b/llvm/include/llvm/Analysis/ConstraintSystem.h
--- a/llvm/include/llvm/Analysis/ConstraintSystem.h
+++ b/llvm/include/llvm/Analysis/ConstraintSystem.h
@@ -53,6 +53,11 @@
   }
 
   bool addVariableRowFill(ArrayRef<int64_t> R) {
+    // If all variable coefficients are 0, the constraint does not provide any
+    // usable information.
+    if (all_of(makeArrayRef(R).drop_front(1), [](int64_t C) { return C == 0; }))
+      return false;
+
     for (auto &CR : Constraints) {
       while (CR.size() != R.size())
         CR.push_back(0);
@@ -75,6 +80,12 @@
   bool isConditionImplied(SmallVector<int64_t, 8> R) const;
 
   void popLastConstraint() { Constraints.pop_back(); }
+  /// Remove the last \p N entries from every row stored in Constraints.
+  void popLastNVariables(unsigned N) {
+    for (auto &Row : Constraints)
+      for (unsigned Remaining = N; Remaining != 0; --Remaining)
+        Row.pop_back();
+  }
 
   /// Returns the number of rows in the constraint system.
   unsigned size() const { return Constraints.size(); }
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -67,6 +67,8 @@
 class Value;
 enum SCEVTypes : unsigned short;
 
+extern bool VerifySCEV;
+
 /// This class represents an analyzed expression in the program.  These are
 /// opaque objects that the client is not allowed to do much with directly.
 ///
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -784,7 +784,7 @@
   struct OutlineInfo {
     using PostOutlineCBTy = std::function<void(Function &)>;
     PostOutlineCBTy PostOutlineCB;
-    BasicBlock *EntryBB, *ExitBB;
+    BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
     SmallVector<Value *, 2> ExcludeArgsFromAggregate;
 
     /// Collect all blocks in between EntryBB and ExitBB in both the given
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1722,15 +1722,9 @@
                    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
                    [IntrNoMem]>;
   def int_ppc_fnmsub
-      : GCCBuiltin<"__builtin_ppc_fnmsub">,
-        Intrinsic <[llvm_double_ty],
-                   [llvm_double_ty, llvm_double_ty, llvm_double_ty],
-                   [IntrNoMem]>;
-  def int_ppc_fnmsubs
-      : GCCBuiltin<"__builtin_ppc_fnmsubs">,
-        Intrinsic <[llvm_float_ty],
-                   [llvm_float_ty, llvm_float_ty, llvm_float_ty],
-                   [IntrNoMem]>;
+      : Intrinsic<[llvm_anyfloat_ty],
+                  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                  [IntrNoMem]>;
   def int_ppc_fre
       : GCCBuiltin<"__builtin_ppc_fre">,
         Intrinsic <[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/IR/VectorBuilder.h
@@ -0,0 +1,99 @@
+//===- llvm/IR/VectorBuilder.h - Builder for VP Intrinsics ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the VectorBuilder class, which is used as a convenient way
+// to create VP intrinsics as if they were LLVM instructions with a consistent
+// and simplified interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_VECTORBUILDER_H
+#define LLVM_IR_VECTORBUILDER_H
+
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/InstrTypes.h>
+#include <llvm/IR/Instruction.h>
+#include <llvm/IR/Value.h>
+
+namespace llvm {
+
+class VectorBuilder {
+public:
+  enum class Behavior {
+    /// Abort if the requested VP intrinsic could not be created.
+    /// This is useful for strict consistency.
+    ReportAndAbort = 0,
+
+    /// Return a default-initialized value if the requested VP intrinsic could
+    /// not be created.
+    /// This is useful for a defensive fallback to non-VP code.
+    SilentlyReturnNone = 1,
+  };
+
+private:
+  IRBuilder<> &Builder;
+  Behavior ErrorHandling;
+
+  /// Explicit mask parameter (null until setMask() is called).
+  Value *Mask;
+  /// Explicit vector length parameter (null until setEVL() is called).
+  Value *ExplicitVectorLength;
+  /// Compile-time vector length (fixed 0 until setStaticVL() is called).
+  ElementCount StaticVectorLength;
+
+  /// Get mask/evl value handles for the current configuration.
+  Value &requestMask();
+  Value &requestEVL();
+
+  void handleError(const char *ErrorMsg) const;
+  template <typename RetType>
+  RetType returnWithError(const char *ErrorMsg) const {
+    handleError(ErrorMsg);
+    return RetType(); // value-initialized result handed back after handleError()
+  }
+
+public:
+  VectorBuilder(IRBuilder<> &Builder,
+                Behavior ErrorHandling = Behavior::ReportAndAbort)
+      : Builder(Builder), ErrorHandling(ErrorHandling), Mask(nullptr),
+        ExplicitVectorLength(nullptr),
+        StaticVectorLength(ElementCount::getFixed(0)) {}
+
+  Module &getModule() const;
+  LLVMContext &getContext() const { return Builder.getContext(); }
+
+  /// All-true mask for the currently configured explicit vector length.
+  Value *getAllTrueMask();
+
+  VectorBuilder &setMask(Value *NewMask) {
+    Mask = NewMask;
+    return *this; // returning *this allows the setters to be chained
+  }
+  VectorBuilder &setEVL(Value *NewExplicitVectorLength) {
+    ExplicitVectorLength = NewExplicitVectorLength;
+    return *this;
+  }
+  VectorBuilder &setStaticVL(unsigned NewFixedVL) {
+    StaticVectorLength = ElementCount::getFixed(NewFixedVL);
+    return *this;
+  }
+  // TODO: setStaticVL(ElementCount) for scalable types.
+
+  /// Emit a VP intrinsic call that mimics a regular instruction.
+  /// This operation behaves according to the configured Behavior.
+  /// \p Opcode      The functional instruction opcode of the emitted intrinsic.
+  /// \p ReturnTy    The return type of the operation.
+  /// \p VecOpArray  The operand list.
+  Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy,
+                                 ArrayRef<Value *> VecOpArray,
+                                 const Twine &Name = Twine());
+};
+
+} // namespace llvm
+
+#endif // LLVM_IR_VECTORBUILDER_H
diff --git a/llvm/include/llvm/ObjCopy/MachO/MachOConfig.h b/llvm/include/llvm/ObjCopy/MachO/MachOConfig.h
--- a/llvm/include/llvm/ObjCopy/MachO/MachOConfig.h
+++ b/llvm/include/llvm/ObjCopy/MachO/MachOConfig.h
@@ -29,6 +29,9 @@
   // install-name-tool's id option
   Optional<StringRef> SharedLibId;
 
+  // Segments to remove if they are empty
+  DenseSet<StringRef> EmptySegmentsToRemove;
+
   // Boolean options
   bool StripSwiftSymbols = false;
   bool KeepUndefined = false;
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -199,9 +199,24 @@
                                  bool &UsedAssumedInformation,
                                  bool Intraprocedural = false);
 
+/// Collect all potential values \p LI could read into \p PotentialValues. That
+/// is, the only values read by \p LI are assumed to be known and all are in
+/// \p PotentialValues. Dependences onto \p QueryingAA are properly tracked,
+/// \p UsedAssumedInformation will inform the caller if assumed information was
+/// used.
+///
+/// \returns True if the assumed potential values are all in \p PotentialValues,
+///          false if something went wrong and the values could not be
+///          determined.
+bool getPotentiallyLoadedValues(Attributor &A, LoadInst &LI,
+                                SmallSetVector<Value *, 4> &PotentialValues,
+                                const AbstractAttribute &QueryingAA,
+                                bool &UsedAssumedInformation,
+                                bool OnlyExact = false);
+
 /// Collect all potential values of the one stored by \p SI into
 /// \p PotentialCopies. That is, the only copies that were made via the
-/// store are assumed to be known and all in \p PotentialCopies. Dependences
+/// store are assumed to be known and all are in \p PotentialCopies. Dependences
 /// onto \p QueryingAA are properly tracked, \p UsedAssumedInformation will
 /// inform the caller if assumed information was used.
 ///
@@ -210,7 +225,8 @@
 ///          determined.
 bool getPotentialCopiesOfStoredValue(
     Attributor &A, StoreInst &SI, SmallSetVector<Value *, 4> &PotentialCopies,
-    const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation);
+    const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation,
+    bool OnlyExact = false);
 
 /// Return true if \p IRP is readonly. This will query respective AAs that
 /// deduce the information and introduce dependences for \p QueryingAA.
@@ -1865,6 +1881,19 @@
   bool checkForAllReturnedValues(function_ref<bool(Value &)> Pred,
                                  const AbstractAttribute &QueryingAA);
 
+  /// Check \p Pred on all instructions in \p Fn with an opcode present in
+  /// \p Opcodes.
+  ///
+  /// This method will evaluate \p Pred on all instructions with an opcode
+  /// present in \p Opcodes and return true if \p Pred holds on all of them.
+  bool checkForAllInstructions(function_ref<bool(Instruction &)> Pred,
+                               const Function *Fn,
+                               const AbstractAttribute &QueryingAA,
+                               const ArrayRef<unsigned> &Opcodes,
+                               bool &UsedAssumedInformation,
+                               bool CheckBBLivenessOnly = false,
+                               bool CheckPotentiallyDead = false);
+
   /// Check \p Pred on all instructions with an opcode present in \p Opcodes.
   ///
   /// This method will evaluate \p Pred on all instructions with an opcode
@@ -4840,21 +4869,13 @@
   virtual bool forallInterferingAccesses(
       OffsetAndSize OAS, function_ref<bool(const Access &, bool)> CB) const = 0;
 
-  /// Call \p CB on all accesses that might interfere with \p LI and return true
-  /// if all such accesses were known and the callback returned true for all of
-  /// them, false otherwise.
-  virtual bool forallInterferingAccesses(
-      LoadInst &LI, function_ref<bool(const Access &, bool)> CB) const = 0;
-  virtual bool forallInterferingAccesses(
-      StoreInst &SI, function_ref<bool(const Access &, bool)> CB) const = 0;
-
-  /// Call \p CB on all write accesses that might interfere with \p LI and
+  /// Call \p CB on all accesses that might interfere with \p I and
   /// return true if all such accesses were known and the callback returned true
   /// for all of them, false otherwise. In contrast to forallInterferingAccesses
   /// this function will perform reasoning to exclude write accesses that cannot
   /// affect the load even if they on the surface look as if they would.
-  virtual bool forallInterferingWrites(
-      Attributor &A, const AbstractAttribute &QueryingAA, LoadInst &LI,
+  virtual bool forallInterferingAccesses(
+      Attributor &A, const AbstractAttribute &QueryingAA, Instruction &I,
       function_ref<bool(const Access &, bool)> CB) const = 0;
 
   /// This function should return true if the type of the \p AA is AAPointerInfo
diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
--- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -92,6 +92,11 @@
     BranchProbabilityInfo *BPI;
     AssumptionCache *AC;
 
+    // A block outside of the extraction set in which any intermediate
+    // allocations will be placed. If this is null, allocations
+    // will be placed in the entry block of the function.
+    BasicBlock *AllocationBlock;
+
     // If true, varargs functions can be extracted.
     bool AllowVarArgs;
 
@@ -120,11 +125,15 @@
     /// code is extracted, including vastart. If AllowAlloca is true, then
     /// extraction of blocks containing alloca instructions would be possible,
     /// however code extractor won't validate whether extraction is legal.
+    /// Any new allocations will be placed in the AllocationBlock, unless
+    /// it is null, in which case they will be placed in the entry block of
+    /// the function from which the code is being extracted.
     CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT = nullptr,
                   bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr,
                   BranchProbabilityInfo *BPI = nullptr,
-                  AssumptionCache *AC = nullptr,
-                  bool AllowVarArgs = false, bool AllowAlloca = false,
+                  AssumptionCache *AC = nullptr, bool AllowVarArgs = false,
+                  bool AllowAlloca = false,
+                  BasicBlock *AllocationBlock = nullptr,
                   std::string Suffix = "");
 
     /// Create a code extractor for a loop body.
diff --git a/llvm/include/llvm/module.modulemap b/llvm/include/llvm/module.modulemap
--- a/llvm/include/llvm/module.modulemap
+++ b/llvm/include/llvm/module.modulemap
@@ -254,6 +254,7 @@
   module IR_InstrTypes { header "IR/InstrTypes.h" export * }
   module IR_Instructions { header "IR/Instructions.h" export * }
   module IR_TypeFinder { header "IR/TypeFinder.h" export * }
+  module IR_VectorBuilder { header "IR/VectorBuilder.h" export * }
 
 
   // Intrinsics.h
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -142,6 +142,12 @@
 STATISTIC(NumBruteForceTripCountsComputed,
           "Number of loops with trip counts computed by force");
 
+#ifdef EXPENSIVE_CHECKS
+bool llvm::VerifySCEV = true;
+#else
+bool llvm::VerifySCEV = false;
+#endif
+
 static cl::opt<unsigned>
 MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden,
                         cl::ZeroOrMore,
@@ -150,9 +156,8 @@
                                  "derived loop"),
                         cl::init(100));
 
-// FIXME: Enable this with EXPENSIVE_CHECKS when the test suite is clean.
-static cl::opt<bool> VerifySCEV(
-    "verify-scev", cl::Hidden,
+static cl::opt<bool, true> VerifySCEVOpt(
+    "verify-scev", cl::Hidden, cl::location(VerifySCEV),
     cl::desc("Verify ScalarEvolution's backedge taken counts (slow)"));
 static cl::opt<bool> VerifySCEVStrict(
     "verify-scev-strict", cl::Hidden,
@@ -526,12 +531,13 @@
 }
 
 void SCEVUnknown::allUsesReplacedWith(Value *New) {
+  // Clear this SCEVUnknown from various maps.
+  SE->forgetMemoizedResults(this);
+
   // Remove this SCEVUnknown from the uniquing map.
   SE->UniqueSCEVs.RemoveNode(this);
 
-  // Update this SCEVUnknown to point to the new value. This is needed
-  // because there may still be outstanding SCEVs which still point to
-  // this SCEVUnknown.
+  // Replace the value pointer in case someone is still using this SCEVUnknown.
   setValPtr(New);
 }
 
@@ -13358,8 +13364,14 @@
     if (!ReachableBlocks.contains(L->getHeader()))
       continue;
 
-    auto *CurBECount = SCM.visit(
-        const_cast<ScalarEvolution *>(this)->getBackedgeTakenCount(L));
+    // Only verify cached BECounts. Computing new BECounts may change the
+    // results of subsequent SCEV uses.
+    auto It = BackedgeTakenCounts.find(L);
+    if (It == BackedgeTakenCounts.end())
+      continue;
+
+    auto *CurBECount =
+        SCM.visit(It->second.getExact(L, const_cast<ScalarEvolution *>(this)));
     auto *NewBECount = SE2.getBackedgeTakenCount(L);
 
     if (CurBECount == SE2.getCouldNotCompute() ||
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21639,9 +21639,10 @@
       SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
   if (SVT != VT.getScalarType())
     for (SDValue &Op : Ops)
-      Op = TLI.isZExtFree(Op.getValueType(), SVT)
-               ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
-               : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT);
+      Op = Op.isUndef() ? DAG.getUNDEF(SVT)
+                        : (TLI.isZExtFree(Op.getValueType(), SVT)
+                               ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
+                               : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT));
   return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
 }
 
diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
--- a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
@@ -71,7 +71,7 @@
   // This list contains all StringCount, BucketCount pairs where BucketCount was
   // just incremented.  It ends before the first BucketCount entry where
   // BucketCount * 3 would overflow a 32-bit unsigned int.
-  static std::map<uint32_t, uint32_t> StringsToBuckets = {
+  static const std::pair<uint32_t, uint32_t> StringsToBuckets[] = {
       {0, 1},
       {1, 2},
       {2, 4},
@@ -124,8 +124,9 @@
       {517197275, 1034394550},
       {775795913, 1551591826},
       {1163693870, 2327387740}};
-  auto Entry = StringsToBuckets.lower_bound(NumStrings);
-  assert(Entry != StringsToBuckets.end());
+  const auto *Entry = llvm::lower_bound(
+      StringsToBuckets, std::make_pair(NumStrings, 0U), llvm::less_first());
+  assert(Entry != std::end(StringsToBuckets));
   return Entry->second;
 }
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -300,6 +300,7 @@
                             /* AssumptionCache */ nullptr,
                             /* AllowVarArgs */ true,
                             /* AllowAlloca */ true,
+                            /* AllocationBlock */ OI.OuterAllocaBB,
                             /* Suffix */ ".omp_par");
 
     LLVM_DEBUG(dbgs() << "Before     outlining: " << *OuterFn << "\n");
@@ -878,6 +879,7 @@
   InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
   FiniCB(PreFiniIP);
 
+  OI.OuterAllocaBB = OuterAllocaBlock;
   OI.EntryBB = PRegEntryBB;
   OI.ExitBB = PRegExitBB;
 
@@ -901,6 +903,7 @@
                           /* AssumptionCache */ nullptr,
                           /* AllowVarArgs */ true,
                           /* AllowAlloca */ true,
+                          /* AllocationBlock */ OuterAllocaBlock,
                           /* Suffix */ ".omp_par");
 
   // Find inputs to, outputs from the code region.
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -61,6 +61,7 @@
   User.cpp
   Value.cpp
   ValueSymbolTable.cpp
+  VectorBuilder.cpp
   Verifier.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -964,6 +964,9 @@
       return Align(CI->getLimitedValue());
     }
   } else if (auto *CstPtr = dyn_cast<Constant>(this)) {
+    // Strip pointer casts to avoid creating an unnecessary ptrtoint expression
+    // if the only "reduction" is combining a bitcast + ptrtoint.
+    CstPtr = CstPtr->stripPointerCasts();
     if (auto *CstInt = dyn_cast_or_null<ConstantInt>(ConstantExpr::getPtrToInt(
             const_cast<Constant *>(CstPtr), DL.getIntPtrType(getType()),
             /*OnlyIfReduced=*/true))) {
diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/IR/VectorBuilder.cpp
@@ -0,0 +1,103 @@
+//===- VectorBuilder.cpp - Builder for VP Intrinsics ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VectorBuilder class, which is used as a convenient
+// way to create VP intrinsics as if they were LLVM instructions with a
+// consistent and simplified interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/FPEnv.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/Intrinsics.h>
+#include <llvm/IR/VectorBuilder.h>
+
+namespace llvm {
+
+void VectorBuilder::handleError(const char *ErrorMsg) const {
+  if (ErrorHandling == Behavior::SilentlyReturnNone)
+    return;
+  report_fatal_error(ErrorMsg);
+}
+
+Module &VectorBuilder::getModule() const {
+  return *Builder.GetInsertBlock()->getModule();
+}
+
+Value *VectorBuilder::getAllTrueMask() {
+  auto *BoolTy = Builder.getInt1Ty();
+  auto *MaskTy = VectorType::get(BoolTy, StaticVectorLength);
+  return ConstantInt::getAllOnesValue(MaskTy);
+}
+
+Value &VectorBuilder::requestMask() {
+  if (Mask)
+    return *Mask;
+
+  return *getAllTrueMask();
+}
+
+Value &VectorBuilder::requestEVL() {
+  if (ExplicitVectorLength)
+    return *ExplicitVectorLength;
+
+  assert(!StaticVectorLength.isScalable() && "TODO vscale lowering");
+  auto *IntTy = Builder.getInt32Ty();
+  return *ConstantInt::get(IntTy, StaticVectorLength.getFixedValue());
+}
+
+Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
+                                              ArrayRef<Value *> InstOpArray,
+                                              const Twine &Name) {
+  auto VPID = VPIntrinsic::getForOpcode(Opcode);
+  if (VPID == Intrinsic::not_intrinsic)
+    return returnWithError<Value *>("No VPIntrinsic for this opcode");
+
+  auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID);
+  auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID);
+  size_t NumInstParams = InstOpArray.size();
+  size_t NumVPParams =
+      NumInstParams + MaskPosOpt.hasValue() + VLenPosOpt.hasValue();
+
+  SmallVector<Value *, 6> IntrinParams;
+
+  // Whether the mask and vlen parameters are at the end of the parameter list.
+  bool TrailingMaskAndVLen =
+      std::min<size_t>(MaskPosOpt.getValueOr(NumInstParams),
+                       VLenPosOpt.getValueOr(NumInstParams)) >= NumInstParams;
+
+  if (TrailingMaskAndVLen) {
+    // Fast path for trailing mask, vector length.
+    IntrinParams.append(InstOpArray.begin(), InstOpArray.end());
+    IntrinParams.resize(NumVPParams);
+  } else {
+    IntrinParams.resize(NumVPParams);
+    // Insert mask and evl operands in between the instruction operands.
+    for (size_t VPParamIdx = 0, ParamIdx = 0; VPParamIdx < NumVPParams;
+         ++VPParamIdx) {
+      if ((MaskPosOpt && MaskPosOpt.getValueOr(NumVPParams) == VPParamIdx) ||
+          (VLenPosOpt && VLenPosOpt.getValueOr(NumVPParams) == VPParamIdx))
+        continue;
+      assert(ParamIdx < NumInstParams);
+      IntrinParams[VPParamIdx] = InstOpArray[ParamIdx++];
+    }
+  }
+
+  if (MaskPosOpt.hasValue())
+    IntrinParams[*MaskPosOpt] = &requestMask();
+  if (VLenPosOpt.hasValue())
+    IntrinParams[*VLenPosOpt] = &requestEVL();
+
+  auto *VPDecl = VPIntrinsic::getDeclarationForParams(&getModule(), VPID,
+                                                      ReturnTy, IntrinParams);
+  return Builder.CreateCall(VPDecl, IntrinParams, Name);
+}
+
+} // namespace llvm
diff --git a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
--- a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
@@ -258,6 +258,21 @@
   if (!MachOConfig.RPathToPrepend.empty())
     Obj.updateLoadCommandIndexes();
 
+  // Remove any empty segments if required.
+  if (!MachOConfig.EmptySegmentsToRemove.empty()) {
+    auto RemovePred = [&MachOConfig](const LoadCommand &LC) {
+      if (LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_SEGMENT_64 ||
+          LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_SEGMENT) {
+        return LC.Sections.empty() &&
+               MachOConfig.EmptySegmentsToRemove.contains(
+                   LC.getSegmentName().getValue());
+      }
+      return false;
+    };
+    if (Error E = Obj.removeLoadCommands(RemovePred))
+      return E;
+  }
+
   return Error::success();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -532,7 +532,11 @@
   FeaturePAuth, FeatureRCPC,
   //v8.4
   FeatureDotProd, FeatureTRACEV8_4, FeatureTLB_RMI,
-  FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO]>;
+  FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO,
+  // Not mandatory in v8.0-R, but included here on the grounds that it
+  // only enables names of system registers
+  FeatureSpecRestrict
+  ]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
@@ -968,7 +972,7 @@
                                  FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8];
   list<SubtargetFeature> R82  = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16,
                                  FeatureFP16FML, FeatureSSBS, FeaturePredRes,
-                                 FeatureSB, FeatureSpecRestrict];
+                                 FeatureSB];
   list<SubtargetFeature> X1   = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
                                  FeatureNEON, FeatureRCPC, FeaturePerfMon,
                                  FeatureSPE, FeatureFullFP16, FeatureDotProd];
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9252,6 +9252,56 @@
   return true;
 }
 
+// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
+// v4i16s or v4i32s. This is really a truncate, which we can construct out of
+// (legal) concats and truncate nodes.
+static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
+  if (V.getValueType() != MVT::v16i8)
+    return SDValue();
+  assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
+
+  for (unsigned X = 0; X < 4; X++) {
+    // Check the first item in each group is an extract from lane 0 of a v4i32
+    // or v4i16.
+    SDValue BaseExt = V.getOperand(X * 4);
+    if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
+         BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
+        !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
+        BaseExt.getConstantOperandVal(1) != 0)
+      return SDValue();
+    SDValue Base = BaseExt.getOperand(0);
+    // And check the other items are extracts from the same vector.
+    for (unsigned Y = 1; Y < 4; Y++) {
+      SDValue Ext = V.getOperand(X * 4 + Y);
+      if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+          Ext.getOperand(0) != Base ||
+          !isa<ConstantSDNode>(Ext.getOperand(1)) ||
+          Ext.getConstantOperandVal(1) != Y)
+        return SDValue();
+    }
+  }
+
+  // Turn the buildvector into a series of truncates and concats, which will
+  // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
+  // concatenated together to produce 2 v8i16. These are both truncated and
+  // concatenated together.
+  SDLoc DL(V);
+  SDValue Trunc[4] = {
+      V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
+      V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
+  for (int I = 0; I < 4; I++)
+    if (Trunc[I].getValueType() == MVT::v4i32)
+      Trunc[I] = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, Trunc[I]);
+  SDValue Concat0 =
+      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
+  SDValue Concat1 =
+      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
+  SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
+  SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
+}
+
 /// Check if a vector shuffle corresponds to a DUP instructions with a larger
 /// element width than the vector lane type. If that is the case the function
 /// returns true and writes the value of the DUP instruction lane operand into
@@ -10871,6 +10921,12 @@
     return SDValue();
   }
 
+  // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
+  // v4i16s or v4i32s. This is really a truncate, which we can construct out of
+  // (legal) concats and truncate nodes.
+  if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
+    return M;
+
   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
   if (NumElts >= 4) {
     if (SDValue shuffle = ReconstructShuffle(Op, DAG))
@@ -12799,12 +12855,15 @@
   assert(VT.isScalableVector() && "Can only lower scalable vectors");
 
   unsigned N, Opcode;
-  static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
-      {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
-      {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
-      {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
-
-  std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
+  static const std::pair<unsigned, std::pair<unsigned, unsigned>>
+      IntrinsicMap[] = {
+          {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
+          {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
+          {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
+
+  std::tie(N, Opcode) = llvm::find_if(IntrinsicMap, [&](auto P) {
+                          return P.first == Intrinsic;
+                        })->second;
   assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
          "invalid tuple vector type!");
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -737,6 +737,12 @@
   "Hardware automatically inserts waitcnt before barrier"
 >;
 
+def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier",
+  "BackOffBarrier",
+  "true",
+  "Hardware supports backing off s_barrier if an exception occurs"
+>;
+
 def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
   "HasTrigReducedRange",
   "true",
@@ -1025,7 +1031,8 @@
    FeatureMadMacF32Insts,
    FeatureSupportsSRAMECC,
    FeaturePackedTID,
-   FullRate64Ops]>;
+   FullRate64Ops,
+   FeatureBackOffBarrier]>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
   [FeatureGFX9,
@@ -1059,7 +1066,8 @@
    FeatureSupportsSRAMECC,
    FeaturePackedTID,
    FeatureArchitectedFlatScratch,
-   FullRate64Ops]>;
+   FullRate64Ops,
+   FeatureBackOffBarrier]>;
 
 // TODO: Organize more features into groups.
 def FeatureGroup {
@@ -1094,7 +1102,8 @@
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_1_1 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1116,7 +1125,8 @@
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_1_2 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1138,7 +1148,8 @@
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_1_3 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1156,7 +1167,8 @@
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_3_0 : FeatureSet<
   [FeatureGFX10,
@@ -1173,7 +1185,8 @@
    FeatureNSAEncoding,
    FeatureNSAMaxSize13,
    FeatureWavefrontSize32,
-   FeatureShaderCyclesRegister]>;
+   FeatureShaderCyclesRegister,
+   FeatureBackOffBarrier]>;
 
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -72,6 +72,7 @@
   // Dynamically set bits that enable features.
   bool FlatForGlobal;
   bool AutoWaitcntBeforeBarrier;
+  bool BackOffBarrier;
   bool UnalignedScratchAccess;
   bool UnalignedAccessMode;
   bool HasApertureRegs;
@@ -493,6 +494,12 @@
     return AutoWaitcntBeforeBarrier;
   }
 
+  /// \returns true if the target supports backing off of s_barrier instructions
+  /// when an exception is raised.
+  bool supportsBackOffBarrier() const {
+    return BackOffBarrier;
+  }
+
   bool hasUnalignedBufferAccess() const {
     return UnalignedBufferAccess;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1135,12 +1135,12 @@
     }
   }
 
-  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
-  // occurs before the instruction. Doing it here prevents any additional
-  // S_WAITCNTs from being emitted if the instruction was marked as
-  // requiring a WAITCNT beforehand.
+  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
+  // not, we need to ensure the subtarget is capable of backing off barrier
+  // instructions in case there are any outstanding memory operations that may
+  // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
-      !ST->hasAutoWaitcntBeforeBarrier()) {
+      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
   }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonArch.h b/llvm/lib/Target/Hexagon/HexagonArch.h
deleted file mode 100644
--- a/llvm/lib/Target/Hexagon/HexagonArch.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===- HexagonArch.h ------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONARCH_H
-#define LLVM_LIB_TARGET_HEXAGON_HEXAGONARCH_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/StringRef.h"
-#include "HexagonDepArch.h"
-#include <algorithm>
-
-namespace llvm {
-namespace Hexagon {
-
-template <class ArchCont, typename Val>
-llvm::Optional<ArchEnum> GetCpu(ArchCont const &ArchList, Val CPUString) {
-  llvm::Optional<ArchEnum> Res;
-  auto Entry = ArchList.find(CPUString);
-  if (Entry != ArchList.end())
-    Res = Entry->second;
-  return Res;
-}
-} // namespace Hexagon
-} // namespace llvm
-#endif  // LLVM_LIB_TARGET_HEXAGON_HEXAGONARCH_H
diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h
--- a/llvm/lib/Target/Hexagon/HexagonDepArch.h
+++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h
@@ -12,82 +12,28 @@
 #ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H
 #define LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H
 
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/ELF.h"
-
-#include <map>
-#include <string>
+#include "llvm/ADT/StringSwitch.h"
 
 namespace llvm {
 namespace Hexagon {
 enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66, V67, V68, V69 };
 
-static constexpr unsigned ArchValsNumArray[] = {5, 55, 60, 62, 65, 66, 67, 68, 69};
-static constexpr ArrayRef<unsigned> ArchValsNum(ArchValsNumArray);
-
-static constexpr StringLiteral ArchValsTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v68", "v69" };
-static constexpr ArrayRef<StringLiteral> ArchValsText(ArchValsTextArray);
-
-static constexpr StringLiteral CpuValsTextArray[] = { "hexagonv5", "hexagonv55", "hexagonv60", "hexagonv62", "hexagonv65", "hexagonv66", "hexagonv67", "hexagonv67t", "hexagonv68", "hexagonv69" };
-static constexpr ArrayRef<StringLiteral> CpuValsText(CpuValsTextArray);
-
-static constexpr StringLiteral CpuNickTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v67t", "v68", "v69" };
-static constexpr ArrayRef<StringLiteral> CpuNickText(CpuNickTextArray);
-
-static const std::map<std::string, ArchEnum> CpuTable{
-    {"generic", Hexagon::ArchEnum::V5},
-    {"hexagonv5", Hexagon::ArchEnum::V5},
-    {"hexagonv55", Hexagon::ArchEnum::V55},
-    {"hexagonv60", Hexagon::ArchEnum::V60},
-    {"hexagonv62", Hexagon::ArchEnum::V62},
-    {"hexagonv65", Hexagon::ArchEnum::V65},
-    {"hexagonv66", Hexagon::ArchEnum::V66},
-    {"hexagonv67", Hexagon::ArchEnum::V67},
-    {"hexagonv67t", Hexagon::ArchEnum::V67},
-    {"hexagonv68", Hexagon::ArchEnum::V68},
-    {"hexagonv69", Hexagon::ArchEnum::V69},
-};
-
-static const std::map<std::string, unsigned> ElfFlagsByCpuStr = {
-  {"generic", llvm::ELF::EF_HEXAGON_MACH_V5},
-  {"hexagonv5", llvm::ELF::EF_HEXAGON_MACH_V5},
-  {"hexagonv55", llvm::ELF::EF_HEXAGON_MACH_V55},
-  {"hexagonv60", llvm::ELF::EF_HEXAGON_MACH_V60},
-  {"hexagonv62", llvm::ELF::EF_HEXAGON_MACH_V62},
-  {"hexagonv65", llvm::ELF::EF_HEXAGON_MACH_V65},
-  {"hexagonv66", llvm::ELF::EF_HEXAGON_MACH_V66},
-  {"hexagonv67", llvm::ELF::EF_HEXAGON_MACH_V67},
-  {"hexagonv67t", llvm::ELF::EF_HEXAGON_MACH_V67T},
-  {"hexagonv68", llvm::ELF::EF_HEXAGON_MACH_V68},
-  {"hexagonv69", llvm::ELF::EF_HEXAGON_MACH_V69},
-};
-static const std::map<unsigned, std::string> ElfArchByMachFlags = {
-  {llvm::ELF::EF_HEXAGON_MACH_V5, "V5"},
-  {llvm::ELF::EF_HEXAGON_MACH_V55, "V55"},
-  {llvm::ELF::EF_HEXAGON_MACH_V60, "V60"},
-  {llvm::ELF::EF_HEXAGON_MACH_V62, "V62"},
-  {llvm::ELF::EF_HEXAGON_MACH_V65, "V65"},
-  {llvm::ELF::EF_HEXAGON_MACH_V66, "V66"},
-  {llvm::ELF::EF_HEXAGON_MACH_V67, "V67"},
-  {llvm::ELF::EF_HEXAGON_MACH_V67T, "V67T"},
-  {llvm::ELF::EF_HEXAGON_MACH_V68, "V68"},
-  {llvm::ELF::EF_HEXAGON_MACH_V69, "V69"},
-};
-static const std::map<unsigned, std::string> ElfCpuByMachFlags = {
-  {llvm::ELF::EF_HEXAGON_MACH_V5, "hexagonv5"},
-  {llvm::ELF::EF_HEXAGON_MACH_V55, "hexagonv55"},
-  {llvm::ELF::EF_HEXAGON_MACH_V60, "hexagonv60"},
-  {llvm::ELF::EF_HEXAGON_MACH_V62, "hexagonv62"},
-  {llvm::ELF::EF_HEXAGON_MACH_V65, "hexagonv65"},
-  {llvm::ELF::EF_HEXAGON_MACH_V66, "hexagonv66"},
-  {llvm::ELF::EF_HEXAGON_MACH_V67, "hexagonv67"},
-  {llvm::ELF::EF_HEXAGON_MACH_V67T, "hexagonv67t"},
-  {llvm::ELF::EF_HEXAGON_MACH_V68, "hexagonv68"},
-  {llvm::ELF::EF_HEXAGON_MACH_V69, "hexagonv69"},
-};
-
+inline Optional<Hexagon::ArchEnum> getCpu(StringRef CPU) {
+  return StringSwitch<Optional<Hexagon::ArchEnum>>(CPU)
+      .Case("generic", Hexagon::ArchEnum::V5)
+      .Case("hexagonv5", Hexagon::ArchEnum::V5)
+      .Case("hexagonv55", Hexagon::ArchEnum::V55)
+      .Case("hexagonv60", Hexagon::ArchEnum::V60)
+      .Case("hexagonv62", Hexagon::ArchEnum::V62)
+      .Case("hexagonv65", Hexagon::ArchEnum::V65)
+      .Case("hexagonv66", Hexagon::ArchEnum::V66)
+      .Case("hexagonv67", Hexagon::ArchEnum::V67)
+      .Case("hexagonv67t", Hexagon::ArchEnum::V67)
+      .Case("hexagonv68", Hexagon::ArchEnum::V68)
+      .Case("hexagonv69", Hexagon::ArchEnum::V69)
+      .Default(None);
+}
 } // namespace Hexagon
-} // namespace llvm;
+} // namespace llvm
 
 #endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -13,7 +13,7 @@
 #ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
 #define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
 
-#include "HexagonArch.h"
+#include "HexagonDepArch.h"
 #include "HexagonFrameLowering.h"
 #include "HexagonISelLowering.h"
 #include "HexagonInstrInfo.h"
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -95,8 +95,7 @@
 
 HexagonSubtarget &
 HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
-  Optional<Hexagon::ArchEnum> ArchVer =
-      Hexagon::GetCpu(Hexagon::CpuTable, CPUString);
+  Optional<Hexagon::ArchEnum> ArchVer = Hexagon::getCpu(CPUString);
   if (ArchVer)
     HexagonArchVersion = *ArchVer;
   else
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -11,7 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/HexagonMCTargetDesc.h"
-#include "HexagonArch.h"
+#include "HexagonDepArch.h"
 #include "HexagonTargetStreamer.h"
 #include "MCTargetDesc/HexagonInstPrinter.h"
 #include "MCTargetDesc/HexagonMCAsmInfo.h"
@@ -410,8 +410,8 @@
 }
 }
 
-static bool isCPUValid(const std::string &CPU) {
-  return Hexagon::CpuTable.find(CPU) != Hexagon::CpuTable.cend();
+static bool isCPUValid(StringRef CPU) {
+  return Hexagon::getCpu(CPU).hasValue();
 }
 
 namespace {
@@ -560,12 +560,18 @@
 }
 
 unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
-  using llvm::Hexagon::ElfFlagsByCpuStr;
-
-  const std::string CPU(STI.getCPU().str());
-  auto F = ElfFlagsByCpuStr.find(CPU);
-  assert(F != ElfFlagsByCpuStr.end() && "Unrecognized Architecture");
-  return F->second;
+  return StringSwitch<unsigned>(STI.getCPU())
+      .Case("generic", llvm::ELF::EF_HEXAGON_MACH_V5)
+      .Case("hexagonv5", llvm::ELF::EF_HEXAGON_MACH_V5)
+      .Case("hexagonv55", llvm::ELF::EF_HEXAGON_MACH_V55)
+      .Case("hexagonv60", llvm::ELF::EF_HEXAGON_MACH_V60)
+      .Case("hexagonv62", llvm::ELF::EF_HEXAGON_MACH_V62)
+      .Case("hexagonv65", llvm::ELF::EF_HEXAGON_MACH_V65)
+      .Case("hexagonv66", llvm::ELF::EF_HEXAGON_MACH_V66)
+      .Case("hexagonv67", llvm::ELF::EF_HEXAGON_MACH_V67)
+      .Case("hexagonv67t", llvm::ELF::EF_HEXAGON_MACH_V67T)
+      .Case("hexagonv68", llvm::ELF::EF_HEXAGON_MACH_V68)
+      .Case("hexagonv69", llvm::ELF::EF_HEXAGON_MACH_V69);
 }
 
 llvm::ArrayRef<MCPhysReg> Hexagon_MC::GetVectRegRev() {
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -33,7 +33,9 @@
 }
 
 def uimm2 : Operand<GRLenVT>;
-def uimm2_plus1 : Operand<GRLenVT>;
+def uimm2_plus1 : Operand<GRLenVT> {
+  let EncoderMethod = "getImmOpValueSub1";
+}
 def uimm3 : Operand<GRLenVT>;
 def uimm5 : Operand<GRLenVT>;
 def uimm6 : Operand<GRLenVT>;
@@ -41,12 +43,20 @@
 def uimm15 : Operand<GRLenVT>;
 def simm12 : Operand<GRLenVT>, ImmLeaf<GRLenVT, [{return isInt<12>(Imm);}]>;
 def simm14 : Operand<GRLenVT>;
-def simm14_lsl2 : Operand<GRLenVT>;
+def simm14_lsl2 : Operand<GRLenVT> {
+  let EncoderMethod = "getImmOpValueAsr2";
+}
 def simm16 : Operand<GRLenVT>;
-def simm16_lsl2 : Operand<GRLenVT>;
+def simm16_lsl2 : Operand<GRLenVT> {
+  let EncoderMethod = "getImmOpValueAsr2";
+}
 def simm20 : Operand<GRLenVT>;
-def simm21_lsl2 : Operand<GRLenVT>;
-def simm26_lsl2 : Operand<GRLenVT>;
+def simm21_lsl2 : Operand<GRLenVT> {
+  let EncoderMethod = "getImmOpValueAsr2";
+}
+def simm26_lsl2 : Operand<GRLenVT> {
+  let EncoderMethod = "getImmOpValueAsr2";
+}
 
 //===----------------------------------------------------------------------===//
 // Instruction Formats
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -51,6 +51,23 @@
   unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                              SmallVectorImpl<MCFixup> &Fixups,
                              const MCSubtargetInfo &STI) const;
+
+  /// Return binary encoding of an immediate operand specified by OpNo.
+  /// The value returned is the value of the immediate minus 1.
+  /// Note that this function is dedicated to specific immediate types,
+  /// e.g. uimm2_plus1.
+  unsigned getImmOpValueSub1(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  /// Return binary encoding of an immediate operand specified by OpNo.
+  /// The value returned is the value of the immediate shifted right
+  /// arithmetically by 2.
+  /// Note that this function is dedicated to specific immediate types,
+  /// e.g. simm14_lsl2, simm16_lsl2, simm21_lsl2 and simm26_lsl2.
+  unsigned getImmOpValueAsr2(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
 };
 } // end anonymous namespace
 
@@ -68,6 +85,22 @@
   llvm_unreachable("Unhandled expression!");
 }
 
+unsigned
+LoongArchMCCodeEmitter::getImmOpValueSub1(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  return MI.getOperand(OpNo).getImm() - 1;
+}
+
+unsigned
+LoongArchMCCodeEmitter::getImmOpValueAsr2(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  unsigned Res = MI.getOperand(OpNo).getImm();
+  assert((Res & 3) == 0 && "lowest 2 bits are non-zero");
+  return Res >> 2;
+}
+
 void LoongArchMCCodeEmitter::encodeInstruction(
     const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
     const MCSubtargetInfo &STI) const {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -627,6 +627,8 @@
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
 
   // To handle counter-based loop conditions.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
@@ -10549,6 +10551,16 @@
              DAG.getTargetConstant(PPC::PRED_EQ, dl, MVT::i32)}),
         0);
   }
+  case Intrinsic::ppc_fnmsub: {
+    EVT VT = Op.getOperand(1).getValueType();
+    if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128))
+      return DAG.getNode(
+          ISD::FNEG, dl, VT,
+          DAG.getNode(ISD::FMA, dl, VT, Op.getOperand(1), Op.getOperand(2),
+                      DAG.getNode(ISD::FNEG, dl, VT, Op.getOperand(3))));
+    return DAG.getNode(PPCISD::FNMSUB, dl, VT, Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
+  }
   case Intrinsic::ppc_convert_f128_to_ppcf128:
   case Intrinsic::ppc_convert_ppcf128_to_f128: {
     RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128
@@ -11220,6 +11232,7 @@
       Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
                                     N->getOperand(2), N->getOperand(1)));
       break;
+    case Intrinsic::ppc_fnmsub:
     case Intrinsic::ppc_convert_f128_to_ppcf128:
       Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG));
       break;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -3728,8 +3728,6 @@
 // XL Compat intrinsics.
 def : Pat<(int_ppc_fmsub f64:$A, f64:$B, f64:$C), (FMSUB $A, $B, $C)>;
 def : Pat<(int_ppc_fmsubs f32:$A, f32:$B, f32:$C), (FMSUBS $A, $B, $C)>;
-def : Pat<(int_ppc_fnmsub f64:$A, f64:$B, f64:$C), (FNMSUB $A, $B, $C)>;
-def : Pat<(int_ppc_fnmsubs f32:$A, f32:$B, f32:$C), (FNMSUBS $A, $B, $C)>;
 def : Pat<(int_ppc_fnmadd f64:$A, f64:$B, f64:$C), (FNMADD $A, $B, $C)>;
 def : Pat<(int_ppc_fnmadds f32:$A, f32:$B, f32:$C), (FNMADDS $A, $B, $C)>;
 def : Pat<(int_ppc_fre f64:$A), (FRE $A)>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2897,7 +2897,6 @@
 
 // XL Compat builtins.
 def : Pat<(int_ppc_fmsub f64:$A, f64:$B, f64:$C), (XSMSUBMDP $A, $B, $C)>;
-def : Pat<(int_ppc_fnmsub f64:$A, f64:$B, f64:$C), (XSNMSUBMDP $A, $B, $C)>;
 def : Pat<(int_ppc_fnmadd f64:$A, f64:$B, f64:$C), (XSNMADDMDP $A, $B, $C)>;
 def : Pat<(int_ppc_fre f64:$A), (XSREDP $A)>;
 def : Pat<(int_ppc_frsqrte vsfrc:$XB), (XSRSQRTEDP $XB)>;
@@ -3311,7 +3310,6 @@
 
 // XL Compat builtins.
 def : Pat<(int_ppc_fmsubs f32:$A, f32:$B, f32:$C), (XSMSUBMSP $A, $B, $C)>;
-def : Pat<(int_ppc_fnmsubs f32:$A, f32:$B, f32:$C), (XSNMSUBMSP $A, $B, $C)>;
 def : Pat<(int_ppc_fnmadds f32:$A, f32:$B, f32:$C), (XSNMADDMSP $A, $B, $C)>;
 def : Pat<(int_ppc_fres f32:$A), (XSRESP $A)>;
 def : Pat<(i32 (int_ppc_extract_exp f64:$A)),
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1027,45 +1027,44 @@
       }
       MVT Src1VT = Src1.getSimpleValueType();
       unsigned VMSLTOpcode, VMSLTMaskOpcode, VMXOROpcode, VMANDNOpcode,
-          VMSetOpcode, VMANDOpcode;
+          VMOROpcode;
       switch (RISCVTargetLowering::getLMUL(Src1VT)) {
       default:
         llvm_unreachable("Unexpected LMUL!");
-#define CASE_VMSLT_VMSET_OPCODES(lmulenum, suffix, suffix_b)                   \
+#define CASE_VMSLT_OPCODES(lmulenum, suffix, suffix_b)                         \
   case RISCVII::VLMUL::lmulenum:                                               \
     VMSLTOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix                 \
                              : RISCV::PseudoVMSLT_VX_##suffix;                 \
     VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix##_MASK      \
                                  : RISCV::PseudoVMSLT_VX_##suffix##_MASK;      \
-    VMSetOpcode = RISCV::PseudoVMSET_M_##suffix_b;                             \
     break;
-        CASE_VMSLT_VMSET_OPCODES(LMUL_F8, MF8, B1)
-        CASE_VMSLT_VMSET_OPCODES(LMUL_F4, MF4, B2)
-        CASE_VMSLT_VMSET_OPCODES(LMUL_F2, MF2, B4)
-        CASE_VMSLT_VMSET_OPCODES(LMUL_1, M1, B8)
-        CASE_VMSLT_VMSET_OPCODES(LMUL_2, M2, B16)
-        CASE_VMSLT_VMSET_OPCODES(LMUL_4, M4, B32)
-        CASE_VMSLT_VMSET_OPCODES(LMUL_8, M8, B64)
-#undef CASE_VMSLT_VMSET_OPCODES
+        CASE_VMSLT_OPCODES(LMUL_F8, MF8, B1)
+        CASE_VMSLT_OPCODES(LMUL_F4, MF4, B2)
+        CASE_VMSLT_OPCODES(LMUL_F2, MF2, B4)
+        CASE_VMSLT_OPCODES(LMUL_1, M1, B8)
+        CASE_VMSLT_OPCODES(LMUL_2, M2, B16)
+        CASE_VMSLT_OPCODES(LMUL_4, M4, B32)
+        CASE_VMSLT_OPCODES(LMUL_8, M8, B64)
+#undef CASE_VMSLT_OPCODES
       }
       // Mask operations use the LMUL from the mask type.
       switch (RISCVTargetLowering::getLMUL(VT)) {
       default:
         llvm_unreachable("Unexpected LMUL!");
-#define CASE_VMXOR_VMANDN_VMAND_OPCODES(lmulenum, suffix)                       \
+#define CASE_VMXOR_VMANDN_VMOR_OPCODES(lmulenum, suffix)                       \
   case RISCVII::VLMUL::lmulenum:                                               \
     VMXOROpcode = RISCV::PseudoVMXOR_MM_##suffix;                              \
     VMANDNOpcode = RISCV::PseudoVMANDN_MM_##suffix;                            \
-    VMANDOpcode = RISCV::PseudoVMAND_MM_##suffix;                              \
+    VMOROpcode = RISCV::PseudoVMOR_MM_##suffix;                                \
     break;
-        CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F8, MF8)
-        CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F4, MF4)
-        CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F2, MF2)
-        CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_1, M1)
-        CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_2, M2)
-        CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_4, M4)
-        CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_8, M8)
-#undef CASE_VMXOR_VMANDN_VMAND_OPCODES
+        CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_F8, MF8)
+        CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_F4, MF4)
+        CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_F2, MF2)
+        CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_1, M1)
+        CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_2, M2)
+        CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_4, M4)
+        CASE_VMXOR_VMANDN_VMOR_OPCODES(LMUL_8, M8)
+#undef CASE_VMXOR_VMANDN_VMOR_OPCODES
       }
       SDValue SEW = CurDAG->getTargetConstant(
           Log2_32(Src1VT.getScalarSizeInBits()), DL, XLenVT);
@@ -1075,12 +1074,17 @@
       SDValue MaskedOff = Node->getOperand(1);
       SDValue Mask = Node->getOperand(4);
 
-      // If vmsgeu_mask with 0 immediate, expand it to {vmset, vmand}.
+      // If vmsgeu_mask with 0 immediate, expand it to vmor mask, maskedoff.
       if (IsCmpUnsignedZero) {
-        SDValue VMSet =
-            SDValue(CurDAG->getMachineNode(VMSetOpcode, DL, VT, VL, SEW), 0);
-        ReplaceNode(Node, CurDAG->getMachineNode(VMANDOpcode, DL, VT,
-                                                 {Mask, VMSet, VL, MaskSEW}));
+        // We don't need vmor if the MaskedOff and the Mask are the same
+        // value.
+        if (Mask == MaskedOff) {
+          ReplaceUses(Node, Mask.getNode());
+          return;
+        }
+        ReplaceNode(Node,
+                    CurDAG->getMachineNode(VMOROpcode, DL, VT,
+                                           {Mask, MaskedOff, VL, MaskSEW}));
         return;
       }
 
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -126,7 +126,7 @@
 
 // Frame indexes representing locations of CSRs which are given a fixed location
 // by save/restore libcalls.
-static const std::map<unsigned, int> FixedCSRFIMap = {
+static const std::pair<unsigned, int> FixedCSRFIMap[] = {
   {/*ra*/  RISCV::X1,   -1},
   {/*s0*/  RISCV::X8,   -2},
   {/*s1*/  RISCV::X9,   -3},
@@ -149,8 +149,9 @@
   if (!RVFI->useSaveRestoreLibCalls(MF))
     return false;
 
-  auto FII = FixedCSRFIMap.find(Reg);
-  if (FII == FixedCSRFIMap.end())
+  const auto *FII =
+      llvm::find_if(FixedCSRFIMap, [&](auto P) { return P.first == Reg; });
+  if (FII == std::end(FixedCSRFIMap))
     return false;
 
   FrameIdx = FII->second;
diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -188,6 +188,11 @@
   SDValue annotateLegalAVL(SDValue AVL) const;
   VETargetMasks getTargetSplitMask(SDValue RawMask, SDValue RawAVL,
                                    PackElem Part) const;
+
+  // Splitting support
+  SDValue getSplitPtrOffset(SDValue Ptr, SDValue ByteStride,
+                            PackElem Part) const;
+  SDValue getSplitPtrStride(SDValue PackStride) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -155,6 +155,10 @@
     return 1;
   case VEISD::VVP_SELECT:
     return 3;
+  case VEISD::VVP_LOAD:
+    return 4;
+  case VEISD::VVP_STORE:
+    return 5;
   }
 
   return None;
@@ -431,4 +435,19 @@
   return VETargetMasks(NewMask, NewAVL);
 }
 
+SDValue VECustomDAG::getSplitPtrOffset(SDValue Ptr, SDValue ByteStride,
+                                       PackElem Part) const {
+  // High starts at base ptr but has more significant bits in the 64bit vector
+  // element.
+  if (Part == PackElem::Hi)
+    return Ptr;
+  return getNode(ISD::ADD, MVT::i64, {Ptr, ByteStride});
+}
+
+SDValue VECustomDAG::getSplitPtrStride(SDValue PackStride) const {
+  if (auto ConstBytes = dyn_cast<ConstantSDNode>(PackStride))
+    return getConstant(2 * ConstBytes->getSExtValue(), MVT::i64);
+  return getNode(ISD::SHL, MVT::i64, {PackStride, getConstant(1, MVT::i32)});
+}
+
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -189,7 +189,9 @@
   SDValue lowerVVP_LOAD_STORE(SDValue Op, VECustomDAG&) const;
 
   SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue legalizeInternalLoadStoreOp(SDValue Op, VECustomDAG &CDAG) const;
   SDValue splitVectorOp(SDValue Op, VECustomDAG &CDAG) const;
+  SDValue splitPackedLoadStore(SDValue Op, VECustomDAG &CDAG) const;
   SDValue legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const;
   SDValue splitMaskArithmetic(SDValue Op, SelectionDAG &DAG) const;
   /// } VVPLowering
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -811,7 +811,7 @@
   // replace to pvfmk.w.up and pvfmk.w.lo
   // replace to pvfmk.s.up and pvfmk.s.lo
 
-  static std::map<unsigned, std::pair<unsigned, unsigned>> VFMKMap = {
+  static const std::pair<unsigned, std::pair<unsigned, unsigned>> VFMKMap[] = {
       {VE::VFMKyal, {VE::VFMKLal, VE::VFMKLal}},
       {VE::VFMKynal, {VE::VFMKLnal, VE::VFMKLnal}},
       {VE::VFMKWyvl, {VE::PVFMKWUPvl, VE::PVFMKWLOvl}},
@@ -822,8 +822,9 @@
 
   unsigned Opcode = MI.getOpcode();
 
-  auto Found = VFMKMap.find(Opcode);
-  if (Found == VFMKMap.end())
+  const auto *Found =
+      llvm::find_if(VFMKMap, [&](auto P) { return P.first == Opcode; });
+  if (Found == std::end(VFMKMap))
     report_fatal_error("unexpected opcode for pseudo vfmk");
 
   unsigned OpcodeUpper = (*Found).second.first;
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -114,8 +114,6 @@
   auto DataVT = *getIdiomaticVectorType(Op.getNode());
   auto Packing = getTypePacking(DataVT);
 
-  assert(Packing == Packing::Normal && "TODO Packed load store isel");
-
   // TODO: Infer lower AVL from mask.
   if (!AVL)
     AVL = CDAG.getConstant(DataVT.getVectorNumElements(), MVT::i32);
@@ -150,10 +148,117 @@
                       {Chain, Data, BasePtr, StrideV, Mask, AVL});
 }
 
+SDValue VETargetLowering::splitPackedLoadStore(SDValue Op,
+                                               VECustomDAG &CDAG) const {
+  auto VVPOC = *getVVPOpcode(Op.getOpcode());
+  assert((VVPOC == VEISD::VVP_LOAD) || (VVPOC == VEISD::VVP_STORE));
+
+  MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+  assert(getTypePacking(DataVT) == Packing::Dense &&
+         "Can only split packed load/store");
+  MVT SplitDataVT = splitVectorType(DataVT);
+
+  SDValue PassThru = getNodePassthru(Op);
+  assert(!PassThru && "Should have been folded in lowering to VVP layer");
+
+  // Analyze the operation
+  SDValue PackedMask = getNodeMask(Op);
+  SDValue PackedAVL = getAnnotatedNodeAVL(Op).first;
+  SDValue PackPtr = getMemoryPtr(Op);
+  SDValue PackData = getStoredValue(Op);
+  SDValue PackStride = getLoadStoreStride(Op, CDAG);
+
+  unsigned ChainResIdx = PackData ? 0 : 1;
+
+  SDValue PartOps[2];
+
+  SDValue UpperPartAVL; // we will use this for packing things back together
+  for (PackElem Part : {PackElem::Hi, PackElem::Lo}) {
+    // VP ops already have an explicit mask and AVL. When expanding from non-VP
+    // attach those additional inputs here.
+    auto SplitTM = CDAG.getTargetSplitMask(PackedMask, PackedAVL, Part);
+
+    // Keep track of the (higher) lvl.
+    if (Part == PackElem::Hi)
+      UpperPartAVL = SplitTM.AVL;
+
+    // Attach non-predicating value operands
+    SmallVector<SDValue, 4> OpVec;
+
+    // Chain
+    OpVec.push_back(getNodeChain(Op));
+
+    // Data
+    if (PackData) {
+      SDValue PartData =
+          CDAG.getUnpack(SplitDataVT, PackData, Part, SplitTM.AVL);
+      OpVec.push_back(PartData);
+    }
+
+    // Ptr & Stride
+    // Push (ptr + ElemBytes * <Part>, 2 * ElemBytes)
+    // Stride info
+    // EVT DataVT = LegalizeVectorType(getMemoryDataVT(Op), Op, DAG, Mode);
+    OpVec.push_back(CDAG.getSplitPtrOffset(PackPtr, PackStride, Part));
+    OpVec.push_back(CDAG.getSplitPtrStride(PackStride));
+
+    // Add predicating args and generate part node
+    OpVec.push_back(SplitTM.Mask);
+    OpVec.push_back(SplitTM.AVL);
+
+    if (PackData) {
+      // Store
+      PartOps[(int)Part] = CDAG.getNode(VVPOC, MVT::Other, OpVec);
+    } else {
+      // Load
+      PartOps[(int)Part] =
+          CDAG.getNode(VVPOC, {SplitDataVT, MVT::Other}, OpVec);
+    }
+  }
+
+  // Merge the chains
+  SDValue LowChain = SDValue(PartOps[(int)PackElem::Lo].getNode(), ChainResIdx);
+  SDValue HiChain = SDValue(PartOps[(int)PackElem::Hi].getNode(), ChainResIdx);
+  SDValue FusedChains =
+      CDAG.getNode(ISD::TokenFactor, MVT::Other, {LowChain, HiChain});
+
+  // Chain only [store]
+  if (PackData)
+    return FusedChains;
+
+  // Re-pack into full packed vector result
+  MVT PackedVT =
+      getLegalVectorType(Packing::Dense, DataVT.getVectorElementType());
+  SDValue PackedVals = CDAG.getPack(PackedVT, PartOps[(int)PackElem::Lo],
+                                    PartOps[(int)PackElem::Hi], UpperPartAVL);
+
+  return CDAG.getMergeValues({PackedVals, FusedChains});
+}
+
+SDValue VETargetLowering::legalizeInternalLoadStoreOp(SDValue Op,
+                                                      VECustomDAG &CDAG) const {
+  LLVM_DEBUG(dbgs() << "::legalizeInternalLoadStoreOp\n";);
+  MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+
+  // TODO: Recognize packable load,store.
+  if (isPackedVectorType(DataVT))
+    return splitPackedLoadStore(Op, CDAG);
+
+  return legalizePackedAVL(Op, CDAG);
+}
+
 SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op,
                                                    SelectionDAG &DAG) const {
+  LLVM_DEBUG(dbgs() << "::legalizeInternalVectorOp\n";);
   VECustomDAG CDAG(DAG, Op);
 
+  // Dispatch to specialized legalization functions.
+  switch (Op->getOpcode()) {
+  case VEISD::VVP_LOAD:
+  case VEISD::VVP_STORE:
+    return legalizeInternalLoadStoreOp(Op, CDAG);
+  }
+
   EVT IdiomVT = Op.getValueType();
   if (isPackedVectorType(IdiomVT) &&
       !supportsPackedMode(Op.getOpcode(), IdiomVT))
@@ -229,7 +334,8 @@
 
   // Half and round up EVL for 32bit element types.
   SDValue LegalAVL = AVL;
-  if (isPackedVectorType(Op.getValueType())) {
+  MVT IdiomVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+  if (isPackedVectorType(IdiomVT)) {
     assert(maySafelyIgnoreMask(Op) &&
            "TODO Shift predication from EVL into Mask");
 
diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
--- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -103,21 +103,12 @@
 
 // Given a resume function @f.resume(%f.frame* %frame), returns the size
 // and expected alignment of %f.frame type.
-static std::pair<uint64_t, Align> getFrameLayout(Function *Resume) {
-  // Prefer to pull information from the function attributes.
+static Optional<std::pair<uint64_t, Align>> getFrameLayout(Function *Resume) {
+  // Pull information from the function attributes.
   auto Size = Resume->getParamDereferenceableBytes(0);
-  auto Align = Resume->getParamAlign(0);
-
-  // If those aren't given, extract them from the type.
-  if (Size == 0 || !Align) {
-    auto *FrameTy = Resume->arg_begin()->getType()->getPointerElementType();
-
-    const DataLayout &DL = Resume->getParent()->getDataLayout();
-    if (!Size) Size = DL.getTypeAllocSize(FrameTy);
-    if (!Align) Align = DL.getABITypeAlign(FrameTy);
-  }
-
-  return std::make_pair(Size, *Align);
+  if (!Size)
+    return None;
+  return std::make_pair(Size, Resume->getParamAlign(0).valueOrOne());
 }
 
 // Finds first non alloca instruction in the entry block of a function.
@@ -361,17 +352,19 @@
     replaceWithConstant(DestroyAddrConstant, It.second);
 
   if (ShouldElide) {
-    auto FrameSizeAndAlign = getFrameLayout(cast<Function>(ResumeAddrConstant));
-    elideHeapAllocations(CoroId->getFunction(), FrameSizeAndAlign.first,
-                         FrameSizeAndAlign.second, AA);
-    coro::replaceCoroFree(CoroId, /*Elide=*/true);
-    NumOfCoroElided++;
+    if (auto FrameSizeAndAlign =
+            getFrameLayout(cast<Function>(ResumeAddrConstant))) {
+      elideHeapAllocations(CoroId->getFunction(), FrameSizeAndAlign->first,
+                           FrameSizeAndAlign->second, AA);
+      coro::replaceCoroFree(CoroId, /*Elide=*/true);
+      NumOfCoroElided++;
 #ifndef NDEBUG
-    if (!CoroElideInfoOutputFilename.empty())
-      *getOrCreateLogFile()
-          << "Elide " << CoroId->getCoroutine()->getName() << " in "
-          << CoroId->getFunction()->getName() << "\n";
+      if (!CoroElideInfoOutputFilename.empty())
+        *getOrCreateLogFile()
+            << "Elide " << CoroId->getCoroutine()->getName() << " in "
+            << CoroId->getFunction()->getName() << "\n";
 #endif
+    }
   }
 
   return true;
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -1079,7 +1079,7 @@
 
   DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
                          DBuilder.createExpression(), DILoc,
-                         Shape.FramePtr->getNextNode());
+                         Shape.getInsertPtAfterFramePtr());
 }
 
 // Build a struct that will keep state for an active coroutine.
@@ -1523,7 +1523,7 @@
   LLVMContext &C = CB->getContext();
   IRBuilder<> Builder(C);
   StructType *FrameTy = Shape.FrameTy;
-  Instruction *FramePtr = Shape.FramePtr;
+  Value *FramePtr = Shape.FramePtr;
   DominatorTree DT(*CB->getFunction());
   SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> DbgPtrAllocaCache;
 
@@ -1576,7 +1576,7 @@
       // For arguments, we will place the store instruction right after
       // the coroutine frame pointer instruction, i.e. bitcast of
       // coro.begin from i8* to %f.frame*.
-      InsertPt = FramePtr->getNextNode();
+      InsertPt = Shape.getInsertPtAfterFramePtr();
 
       // If we're spilling an Argument, make sure we clear 'nocapture'
       // from the coroutine function.
@@ -1593,7 +1593,7 @@
       if (!DT.dominates(CB, I)) {
         // If it is not dominated by CoroBegin, then spill should be
         // inserted immediately after CoroFrame is computed.
-        InsertPt = FramePtr->getNextNode();
+        InsertPt = Shape.getInsertPtAfterFramePtr();
       } else if (auto *II = dyn_cast<InvokeInst>(I)) {
         // If we are spilling the result of the invoke instruction, split
         // the normal edge and insert the spill in the new block.
@@ -1686,10 +1686,10 @@
     }
   }
 
-  BasicBlock *FramePtrBB = FramePtr->getParent();
+  BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent();
 
-  auto SpillBlock =
-      FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB");
+  auto SpillBlock = FramePtrBB->splitBasicBlock(
+      Shape.getInsertPtAfterFramePtr(), "AllocaSpillBB");
   SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill");
   Shape.AllocaSpillBlock = SpillBlock;
 
@@ -1739,7 +1739,7 @@
     for (Instruction *I : UsersToUpdate)
       I->replaceUsesOfWith(Alloca, G);
   }
-  Builder.SetInsertPoint(FramePtr->getNextNode());
+  Builder.SetInsertPoint(Shape.getInsertPtAfterFramePtr());
   for (const auto &A : FrameData.Allocas) {
     AllocaInst *Alloca = A.Alloca;
     if (A.MayWriteBeforeCoroBegin) {
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -128,7 +128,7 @@
   StructType *FrameTy;
   Align FrameAlign;
   uint64_t FrameSize;
-  Instruction *FramePtr;
+  Value *FramePtr;
   BasicBlock *AllocaSpillBlock;
 
   /// This would only be true if optimization are enabled.
@@ -267,6 +267,12 @@
     return nullptr;
   }
 
+  Instruction *getInsertPtAfterFramePtr() const {
+    if (auto *I = dyn_cast<Instruction>(FramePtr))
+      return I->getNextNode();
+    return &cast<Argument>(FramePtr)->getParent()->getEntryBlock().front();
+  }
+
   /// Allocate memory according to the rules of the active lowering.
   ///
   /// \param CG - if non-null, will be updated for the new call
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1152,7 +1152,8 @@
                             Function *DestroyFn, Function *CleanupFn) {
   assert(Shape.ABI == coro::ABI::Switch);
 
-  IRBuilder<> Builder(Shape.FramePtr->getNextNode());
+  IRBuilder<> Builder(Shape.getInsertPtAfterFramePtr());
+
   auto *ResumeAddr = Builder.CreateStructGEP(
       Shape.FrameTy, Shape.FramePtr, coro::Shape::SwitchFieldIndex::Resume,
       "resume.addr");
@@ -1663,7 +1664,7 @@
   // Map all uses of llvm.coro.begin to the allocated frame pointer.
   {
     // Make sure we don't invalidate Shape.FramePtr.
-    TrackingVH<Instruction> Handle(Shape.FramePtr);
+    TrackingVH<Value> Handle(Shape.FramePtr);
     Shape.CoroBegin->replaceAllUsesWith(FramePtr);
     Shape.FramePtr = Handle.getValPtr();
   }
@@ -1775,7 +1776,7 @@
   // Map all uses of llvm.coro.begin to the allocated frame pointer.
   {
     // Make sure we don't invalidate Shape.FramePtr.
-    TrackingVH<Instruction> Handle(Shape.FramePtr);
+    TrackingVH<Value> Handle(Shape.FramePtr);
     Shape.CoroBegin->replaceAllUsesWith(RawFramePtr);
     Shape.FramePtr = Handle.getValPtr();
   }
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -255,17 +255,24 @@
 
 bool AA::isValidAtPosition(const Value &V, const Instruction &CtxI,
                            InformationCache &InfoCache) {
-  if (isa<Constant>(V))
+  if (isa<Constant>(V) || &V == &CtxI)
     return true;
   const Function *Scope = CtxI.getFunction();
   if (auto *A = dyn_cast<Argument>(&V))
     return A->getParent() == Scope;
-  if (auto *I = dyn_cast<Instruction>(&V))
+  if (auto *I = dyn_cast<Instruction>(&V)) {
     if (I->getFunction() == Scope) {
-      const DominatorTree *DT =
-          InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Scope);
-      return DT && DT->dominates(I, &CtxI);
+      if (const DominatorTree *DT =
+              InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(
+                  *Scope))
+        return DT->dominates(I, &CtxI);
+      // Local dominance check mostly for the old PM passes.
+      if (I->getParent() == CtxI.getParent())
+        return llvm::any_of(
+            make_range(I->getIterator(), I->getParent()->end()),
+            [&](const Instruction &AfterI) { return &AfterI == &CtxI; });
     }
+  }
   return false;
 }
 
@@ -315,22 +322,32 @@
   return nullptr;
 }
 
-bool AA::getPotentialCopiesOfStoredValue(
-    Attributor &A, StoreInst &SI, SmallSetVector<Value *, 4> &PotentialCopies,
-    const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation) {
-
-  Value &Ptr = *SI.getPointerOperand();
+template <bool IsLoad, typename Ty>
+static bool
+getPotentialCopiesOfMemoryValue(Attributor &A, Ty &I,
+                                SmallSetVector<Value *, 4> &PotentialCopies,
+                                const AbstractAttribute &QueryingAA,
+                                bool &UsedAssumedInformation, bool OnlyExact) {
+  LLVM_DEBUG(dbgs() << "Trying to determine the potential copies of " << I
+                    << " (only exact: " << OnlyExact << ")\n";);
+
+  Value &Ptr = *I.getPointerOperand();
   SmallVector<Value *, 8> Objects;
-  if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, QueryingAA, &SI,
+  if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, QueryingAA, &I,
                                        UsedAssumedInformation)) {
     LLVM_DEBUG(
         dbgs() << "Underlying objects stored into could not be determined\n";);
     return false;
   }
 
+  // Containers to remember the pointer infos and new copies while we are not
+  // sure that we can find all of them. If we abort we want to avoid spurious
+  // dependences and potential copies in the provided container.
   SmallVector<const AAPointerInfo *> PIs;
   SmallVector<Value *> NewCopies;
 
+  const auto *TLI =
+      A.getInfoCache().getTargetLibraryInfoForFunction(*I.getFunction());
   for (Value *Obj : Objects) {
     LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n");
     if (isa<UndefValue>(Obj))
@@ -338,7 +355,7 @@
     if (isa<ConstantPointerNull>(Obj)) {
       // A null pointer access can be undefined but any offset from null may
       // be OK. We do not try to optimize the latter.
-      if (!NullPointerIsDefined(SI.getFunction(),
+      if (!NullPointerIsDefined(I.getFunction(),
                                 Ptr.getType()->getPointerAddressSpace()) &&
           A.getAssumedSimplified(Ptr, QueryingAA, UsedAssumedInformation) ==
               Obj)
@@ -347,8 +364,9 @@
           dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
       return false;
     }
+    // TODO: Use assumed noalias return.
     if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj) &&
-        !isNoAliasCall(Obj)) {
+        !(IsLoad ? isAllocationFn(Obj, TLI) : isNoAliasCall(Obj))) {
       LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << *Obj
                         << "\n";);
       return false;
@@ -361,23 +379,54 @@
         return false;
       }
 
+    if (IsLoad) {
+      Value *InitialValue = AA::getInitialValueForObj(*Obj, *I.getType(), TLI);
+      if (!InitialValue)
+        return false;
+      NewCopies.push_back(InitialValue);
+    }
+
     auto CheckAccess = [&](const AAPointerInfo::Access &Acc, bool IsExact) {
-      if (!Acc.isRead())
+      if ((IsLoad && !Acc.isWrite()) || (!IsLoad && !Acc.isRead()))
         return true;
-      auto *LI = dyn_cast<LoadInst>(Acc.getRemoteInst());
-      if (!LI) {
-        LLVM_DEBUG(dbgs() << "Underlying object read through a non-load "
-                             "instruction not supported yet: "
-                          << *Acc.getRemoteInst() << "\n";);
+      if (OnlyExact && !IsExact) {
+        LLVM_DEBUG(dbgs() << "Non exact access " << *Acc.getRemoteInst()
+                          << ", abort!\n");
         return false;
       }
-      NewCopies.push_back(LI);
+      if (IsLoad) {
+        assert(isa<LoadInst>(I) && "Expected load or store instruction only!");
+        if (Acc.isWrittenValueYetUndetermined())
+          return true;
+        if (!Acc.isWrittenValueUnknown()) {
+          NewCopies.push_back(Acc.getWrittenValue());
+          return true;
+        }
+        auto *SI = dyn_cast<StoreInst>(Acc.getRemoteInst());
+        if (!SI) {
+          LLVM_DEBUG(dbgs() << "Underlying object written through a non-store "
+                               "instruction not supported yet: "
+                            << *Acc.getRemoteInst() << "\n";);
+          return false;
+        }
+        NewCopies.push_back(SI->getValueOperand());
+      } else {
+        assert(isa<StoreInst>(I) && "Expected load or store instruction only!");
+        auto *LI = dyn_cast<LoadInst>(Acc.getRemoteInst());
+        if (!LI && OnlyExact) {
+          LLVM_DEBUG(dbgs() << "Underlying object read through a non-load "
+                               "instruction not supported yet: "
+                            << *Acc.getRemoteInst() << "\n";);
+          return false;
+        }
+        NewCopies.push_back(Acc.getRemoteInst());
+      }
       return true;
     };
 
     auto &PI = A.getAAFor<AAPointerInfo>(QueryingAA, IRPosition::value(*Obj),
                                          DepClassTy::NONE);
-    if (!PI.forallInterferingAccesses(SI, CheckAccess)) {
+    if (!PI.forallInterferingAccesses(A, QueryingAA, I, CheckAccess)) {
       LLVM_DEBUG(
           dbgs()
           << "Failed to verify all interfering accesses for underlying object: "
@@ -387,6 +436,9 @@
     PIs.push_back(&PI);
   }
 
+  // Only if we successfully collected all potential copies do we record
+  // dependences (on AAPointerInfo AAs not at a fixpoint). We also only then
+  // modify the given PotentialCopies container.
   for (auto *PI : PIs) {
     if (!PI->getState().isAtFixpoint())
       UsedAssumedInformation = true;
@@ -397,6 +449,23 @@
   return true;
 }
 
+bool AA::getPotentiallyLoadedValues(Attributor &A, LoadInst &LI,
+                                    SmallSetVector<Value *, 4> &PotentialValues,
+                                    const AbstractAttribute &QueryingAA,
+                                    bool &UsedAssumedInformation,
+                                    bool OnlyExact) {
+  return getPotentialCopiesOfMemoryValue</* IsLoad */ true>(
+      A, LI, PotentialValues, QueryingAA, UsedAssumedInformation, OnlyExact);
+}
+
+bool AA::getPotentialCopiesOfStoredValue(
+    Attributor &A, StoreInst &SI, SmallSetVector<Value *, 4> &PotentialCopies,
+    const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation,
+    bool OnlyExact) {
+  return getPotentialCopiesOfMemoryValue</* IsLoad */ false>(
+      A, SI, PotentialCopies, QueryingAA, UsedAssumedInformation, OnlyExact);
+}
+
 static bool isAssumedReadOnlyOrReadNone(Attributor &A, const IRPosition &IRP,
                                         const AbstractAttribute &QueryingAA,
                                         bool RequireReadNone, bool &IsKnown) {
@@ -1472,30 +1541,24 @@
 }
 
 bool Attributor::checkForAllInstructions(function_ref<bool(Instruction &)> Pred,
+                                         const Function *Fn,
                                          const AbstractAttribute &QueryingAA,
                                          const ArrayRef<unsigned> &Opcodes,
                                          bool &UsedAssumedInformation,
                                          bool CheckBBLivenessOnly,
                                          bool CheckPotentiallyDead) {
-
-  const IRPosition &IRP = QueryingAA.getIRPosition();
   // Since we need to provide instructions we have to have an exact definition.
-  const Function *AssociatedFunction = IRP.getAssociatedFunction();
-  if (!AssociatedFunction)
-    return false;
-
-  if (AssociatedFunction->isDeclaration())
+  if (!Fn || Fn->isDeclaration())
     return false;
 
   // TODO: use the function scope once we have call site AAReturnedValues.
-  const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
+  const IRPosition &QueryIRP = IRPosition::function(*Fn);
   const auto *LivenessAA =
       (CheckBBLivenessOnly || CheckPotentiallyDead)
           ? nullptr
           : &(getAAFor<AAIsDead>(QueryingAA, QueryIRP, DepClassTy::NONE));
 
-  auto &OpcodeInstMap =
-      InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction);
+  auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn);
   if (!checkForAllInstructionsImpl(this, OpcodeInstMap, Pred, &QueryingAA,
                                    LivenessAA, Opcodes, UsedAssumedInformation,
                                    CheckBBLivenessOnly, CheckPotentiallyDead))
@@ -1504,6 +1567,19 @@
   return true;
 }
 
+bool Attributor::checkForAllInstructions(function_ref<bool(Instruction &)> Pred,
+                                         const AbstractAttribute &QueryingAA,
+                                         const ArrayRef<unsigned> &Opcodes,
+                                         bool &UsedAssumedInformation,
+                                         bool CheckBBLivenessOnly,
+                                         bool CheckPotentiallyDead) {
+  // Delegate to the overload taking the function; it rejects a null Fn.
+  const Function *Fn = QueryingAA.getIRPosition().getAssociatedFunction();
+  return checkForAllInstructions(Pred, Fn, QueryingAA, Opcodes,
+                                 UsedAssumedInformation, CheckBBLivenessOnly,
+                                 CheckPotentiallyDead);
+}
+
 bool Attributor::checkForAllReadWriteInstructions(
     function_ref<bool(Instruction &)> Pred, AbstractAttribute &QueryingAA,
     bool &UsedAssumedInformation) {
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -73,11 +73,11 @@
     cl::location(llvm::PotentialConstantIntValuesState::MaxPotentialValues),
     cl::init(7));
 
-static cl::opt<unsigned>
-    MaxInterferingWrites("attributor-max-interfering-writes", cl::Hidden,
-                         cl::desc("Maximum number of interfering writes to "
-                                  "check before assuming all might interfere."),
-                         cl::init(6));
+static cl::opt<unsigned> MaxInterferingAccesses(
+    "attributor-max-interfering-accesses", cl::Hidden,
+    cl::desc("Maximum number of interfering accesses to "
+             "check before assuming all might interfere."),
+    cl::init(6));
 
 STATISTIC(NumAAs, "Number of abstract attributes created");
 
@@ -400,6 +400,31 @@
       }
     }
 
+    if (auto *LI = dyn_cast<LoadInst>(V)) {
+      bool UsedAssumedInformation = false;
+      SmallSetVector<Value *, 4> PotentialCopies;
+      if (AA::getPotentiallyLoadedValues(A, *LI, PotentialCopies, QueryingAA,
+                                         UsedAssumedInformation,
+                                         /* OnlyExact */ true)) {
+        // Values have to be dynamically unique or we lose the fact that a
+        // single llvm::Value might represent two runtime values (e.g., stack
+        // locations in different recursive calls).
+        bool DynamicallyUnique =
+            llvm::all_of(PotentialCopies, [&A, &QueryingAA](Value *PC) {
+              return AA::isDynamicallyUnique(A, QueryingAA, *PC);
+            });
+        if (DynamicallyUnique &&
+            (!Intraprocedural || !CtxI ||
+             llvm::all_of(PotentialCopies, [CtxI](Value *PC) {
+               return AA::isValidInScope(*PC, CtxI->getFunction());
+             }))) {
+          for (auto *PotentialCopy : PotentialCopies)
+            Worklist.push_back({PotentialCopy, CtxI});
+          continue;
+        }
+      }
+    }
+
     // Once a leaf is reached we inform the user through the callback.
     if (!VisitValueCB(*V, CtxI, State, Iteration > 1)) {
       LLVM_DEBUG(dbgs() << "Generic value traversal visit callback failed for: "
@@ -440,10 +465,11 @@
   return true;
 }
 
-const Value *stripAndAccumulateMinimalOffsets(
-    Attributor &A, const AbstractAttribute &QueryingAA, const Value *Val,
-    const DataLayout &DL, APInt &Offset, bool AllowNonInbounds,
-    bool UseAssumed = false) {
+static const Value *
+stripAndAccumulateOffsets(Attributor &A, const AbstractAttribute &QueryingAA,
+                          const Value *Val, const DataLayout &DL, APInt &Offset,
+                          bool GetMinOffset, bool AllowNonInbounds,
+                          bool UseAssumed = false) {
 
   auto AttributorAnalysis = [&](Value &V, APInt &ROffset) -> bool {
     const IRPosition &Pos = IRPosition::value(V);
@@ -454,14 +480,20 @@
                                                     : DepClassTy::NONE);
     ConstantRange Range = UseAssumed ? ValueConstantRangeAA.getAssumed()
                                      : ValueConstantRangeAA.getKnown();
+    if (Range.isFullSet())
+      return false;
+
     // We can only use the lower part of the range because the upper part can
     // be higher than what the value can really be.
-    ROffset = Range.getSignedMin();
+    if (GetMinOffset)
+      ROffset = Range.getSignedMin();
+    else
+      ROffset = Range.getSignedMax();
     return true;
   };
 
   return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds,
-                                                /* AllowInvariant */ false,
+                                                /* AllowInvariant */ true,
                                                 AttributorAnalysis);
 }
 
@@ -470,8 +502,9 @@
                         const Value *Ptr, int64_t &BytesOffset,
                         const DataLayout &DL, bool AllowNonInbounds = false) {
   APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
-  const Value *Base = stripAndAccumulateMinimalOffsets(
-      A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds);
+  const Value *Base =
+      stripAndAccumulateOffsets(A, QueryingAA, Ptr, DL, OffsetAPInt,
+                                /* GetMinOffset */ true, AllowNonInbounds);
 
   BytesOffset = OffsetAPInt.getSExtValue();
   return Base;
@@ -679,7 +712,6 @@
     return clampStateAndIndicateChange(S, AA.getState());
   }
 };
-} // namespace
 
 /// Helper function to accumulate uses.
 template <class AAType, typename StateType = typename AAType::StateType>
@@ -791,6 +823,7 @@
     S += ParentState;
   }
 }
+} // namespace
 
 /// ------------------------ PointerInfo ---------------------------------------
 
@@ -1051,6 +1084,7 @@
   BooleanState BS;
 };
 
+namespace {
 struct AAPointerInfoImpl
     : public StateWrapper<AA::PointerInfo::State, AAPointerInfo> {
   using BaseTy = StateWrapper<AA::PointerInfo::State, AAPointerInfo>;
@@ -1079,22 +1113,12 @@
     return State::forallInterferingAccesses(OAS, CB);
   }
   bool forallInterferingAccesses(
-      LoadInst &LI, function_ref<bool(const AAPointerInfo::Access &, bool)> CB)
-      const override {
-    return State::forallInterferingAccesses(LI, CB);
-  }
-  bool forallInterferingAccesses(
-      StoreInst &SI, function_ref<bool(const AAPointerInfo::Access &, bool)> CB)
-      const override {
-    return State::forallInterferingAccesses(SI, CB);
-  }
-  bool forallInterferingWrites(
-      Attributor &A, const AbstractAttribute &QueryingAA, LoadInst &LI,
+      Attributor &A, const AbstractAttribute &QueryingAA, Instruction &I,
       function_ref<bool(const Access &, bool)> UserCB) const override {
     SmallPtrSet<const Access *, 8> DominatingWrites;
-    SmallVector<std::pair<const Access *, bool>, 8> InterferingWrites;
+    SmallVector<std::pair<const Access *, bool>, 8> InterferingAccesses;
 
-    Function &Scope = *LI.getFunction();
+    Function &Scope = *I.getFunction();
     const auto &NoSyncAA = A.getAAFor<AANoSync>(
         QueryingAA, IRPosition::function(Scope), DepClassTy::OPTIONAL);
     const auto *ExecDomainAA = A.lookupAAFor<AAExecutionDomain>(
@@ -1122,13 +1146,15 @@
 
     // TODO: Use inter-procedural reachability and dominance.
     const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
-        QueryingAA, IRPosition::function(*LI.getFunction()),
-        DepClassTy::OPTIONAL);
+        QueryingAA, IRPosition::function(Scope), DepClassTy::OPTIONAL);
 
-    const bool CanUseCFGResoning = CanIgnoreThreading(LI);
+    const bool FindInterferingWrites = I.mayReadFromMemory();
+    const bool FindInterferingReads = I.mayWriteToMemory();
+    const bool UseDominanceReasoning = FindInterferingWrites;
+    const bool CanUseCFGResoning = CanIgnoreThreading(I);
     InformationCache &InfoCache = A.getInfoCache();
     const DominatorTree *DT =
-        NoRecurseAA.isKnownNoRecurse()
+        NoRecurseAA.isKnownNoRecurse() && UseDominanceReasoning
             ? InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(
                   Scope)
             : nullptr;
@@ -1184,33 +1210,37 @@
     }
 
     auto AccessCB = [&](const Access &Acc, bool Exact) {
-      if (!Acc.isWrite())
+      if ((!FindInterferingWrites || !Acc.isWrite()) &&
+          (!FindInterferingReads || !Acc.isRead()))
         return true;
 
       // For now we only filter accesses based on CFG reasoning which does not
       // work yet if we have threading effects, or the access is complicated.
       if (CanUseCFGResoning) {
-        if (!AA::isPotentiallyReachable(A, *Acc.getLocalInst(), LI, QueryingAA,
-                                        IsLiveInCalleeCB))
+        if ((!Acc.isWrite() ||
+             !AA::isPotentiallyReachable(A, *Acc.getLocalInst(), I, QueryingAA,
+                                         IsLiveInCalleeCB)) &&
+            (!Acc.isRead() ||
+             !AA::isPotentiallyReachable(A, I, *Acc.getLocalInst(), QueryingAA,
+                                         IsLiveInCalleeCB)))
           return true;
-        if (DT && Exact &&
-            (Acc.getLocalInst()->getFunction() == LI.getFunction()) &&
+        if (DT && Exact && (Acc.getLocalInst()->getFunction() == &Scope) &&
             IsSameThreadAsLoad(Acc)) {
-          if (DT->dominates(Acc.getLocalInst(), &LI))
+          if (DT->dominates(Acc.getLocalInst(), &I))
             DominatingWrites.insert(&Acc);
         }
       }
 
-      InterferingWrites.push_back({&Acc, Exact});
+      InterferingAccesses.push_back({&Acc, Exact});
       return true;
     };
-    if (!State::forallInterferingAccesses(LI, AccessCB))
+    if (!State::forallInterferingAccesses(I, AccessCB))
       return false;
 
     // If we cannot use CFG reasoning we only filter the non-write accesses
     // and are done here.
     if (!CanUseCFGResoning) {
-      for (auto &It : InterferingWrites)
+      for (auto &It : InterferingAccesses)
         if (!UserCB(*It.first, It.second))
           return false;
       return true;
@@ -1237,11 +1267,11 @@
       return false;
     };
 
-    // Run the user callback on all writes we cannot skip and return if that
+    // Run the user callback on all accesses we cannot skip and return if that
     // succeeded for all or not.
-    unsigned NumInterferingWrites = InterferingWrites.size();
-    for (auto &It : InterferingWrites) {
-      if (!DT || NumInterferingWrites > MaxInterferingWrites ||
+    unsigned NumInterferingAccesses = InterferingAccesses.size();
+    for (auto &It : InterferingAccesses) {
+      if (!DT || NumInterferingAccesses > MaxInterferingAccesses ||
           !CanSkipAccess(*It.first, It.second)) {
         if (!UserCB(*It.first, It.second))
           return false;
@@ -1573,7 +1603,7 @@
         LengthVal = Length->getSExtValue();
       Value &Ptr = getAssociatedValue();
       unsigned ArgNo = getIRPosition().getCallSiteArgNo();
-      ChangeStatus Changed;
+      ChangeStatus Changed = ChangeStatus::UNCHANGED;
       if (ArgNo == 0) {
         handleAccess(A, *MI, Ptr, nullptr, AccessKind::AK_WRITE, 0, Changed,
                      nullptr, LengthVal);
@@ -1616,9 +1646,11 @@
     AAPointerInfoImpl::trackPointerInfoStatistics(getIRPosition());
   }
 };
+} // namespace
 
 /// -----------------------NoUnwind Function Attribute--------------------------
 
+namespace {
 struct AANoUnwindImpl : AANoUnwind {
   AANoUnwindImpl(const IRPosition &IRP, Attributor &A) : AANoUnwind(IRP, A) {}
 
@@ -1690,9 +1722,11 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); }
 };
+} // namespace
 
 /// --------------------- Function Return Values -------------------------------
 
+namespace {
 /// "Attribute" that collects all potential returned values and the return
 /// instructions that they arise from.
 ///
@@ -1939,20 +1973,10 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override {}
 };
+} // namespace
 
 /// ------------------------ NoSync Function Attribute -------------------------
 
-struct AANoSyncImpl : AANoSync {
-  AANoSyncImpl(const IRPosition &IRP, Attributor &A) : AANoSync(IRP, A) {}
-
-  const std::string getAsStr() const override {
-    return getAssumed() ? "nosync" : "may-sync";
-  }
-
-  /// See AbstractAttribute::updateImpl(...).
-  ChangeStatus updateImpl(Attributor &A) override;
-};
-
 bool AANoSync::isNonRelaxedAtomic(const Instruction *I) {
   if (!I->isAtomic())
     return false;
@@ -1995,6 +2019,18 @@
   return false;
 }
 
+namespace {
+struct AANoSyncImpl : AANoSync {
+  AANoSyncImpl(const IRPosition &IRP, Attributor &A) : AANoSync(IRP, A) {}
+
+  const std::string getAsStr() const override {
+    return getAssumed() ? "nosync" : "may-sync";
+  }
+
+  /// See AbstractAttribute::updateImpl(...).
+  ChangeStatus updateImpl(Attributor &A) override;
+};
+
 ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) {
 
   auto CheckRWInstForNoSync = [&](Instruction &I) {
@@ -2057,9 +2093,11 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); }
 };
+} // namespace
 
 /// ------------------------ No-Free Attributes ----------------------------
 
+namespace {
 struct AANoFreeImpl : public AANoFree {
   AANoFreeImpl(const IRPosition &IRP, Attributor &A) : AANoFree(IRP, A) {}
 
@@ -2241,8 +2279,10 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) }
 };
+} // namespace
 
 /// ------------------------ NonNull Argument Attribute ------------------------
+namespace {
 static int64_t getKnownNonNullAndDerefBytesForUse(
     Attributor &A, const AbstractAttribute &QueryingAA, Value &AssociatedValue,
     const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) {
@@ -2472,9 +2512,11 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) }
 };
+} // namespace
 
 /// ------------------------ No-Recurse Attributes ----------------------------
 
+namespace {
 struct AANoRecurseImpl : public AANoRecurse {
   AANoRecurseImpl(const IRPosition &IRP, Attributor &A) : AANoRecurse(IRP, A) {}
 
@@ -2550,9 +2592,11 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); }
 };
+} // namespace
 
 /// -------------------- Undefined-Behavior Attributes ------------------------
 
+namespace {
 struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
   AAUndefinedBehaviorImpl(const IRPosition &IRP, Attributor &A)
       : AAUndefinedBehavior(IRP, A) {}
@@ -2776,7 +2820,7 @@
     case Instruction::AtomicRMW:
       return !AssumedNoUBInsts.count(I);
     case Instruction::Br: {
-      auto BrInst = cast<BranchInst>(I);
+      auto *BrInst = cast<BranchInst>(I);
       if (BrInst->isUnconditional())
         return false;
       return !AssumedNoUBInsts.count(I);
@@ -2877,9 +2921,11 @@
         KnownUBInsts.size();
   }
 };
+} // namespace
 
 /// ------------------------ Will-Return Attributes ----------------------------
 
+namespace {
 // Helper function that checks whether a function has any cycle which we don't
 // know if it is bounded or not.
 // Loops with maximum trip count are considered bounded, any other cycle not.
@@ -3018,9 +3064,11 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); }
 };
+} // namespace
 
 /// -------------------AAReachability Attribute--------------------------
 
+namespace {
 struct AAReachabilityImpl : AAReachability {
   AAReachabilityImpl(const IRPosition &IRP, Attributor &A)
       : AAReachability(IRP, A) {}
@@ -3047,9 +3095,11 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); }
 };
+} // namespace
 
 /// ------------------------ NoAlias Argument Attribute ------------------------
 
+namespace {
 struct AANoAliasImpl : AANoAlias {
   AANoAliasImpl(const IRPosition &IRP, Attributor &A) : AANoAlias(IRP, A) {
     assert(getAssociatedType()->isPointerTy() &&
@@ -3423,9 +3473,11 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); }
 };
+} // namespace
 
 /// -------------------AAIsDead Function Attribute-----------------------
 
+namespace {
 struct AAIsDeadValueImpl : public AAIsDead {
   AAIsDeadValueImpl(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
 
@@ -3452,7 +3504,7 @@
   }
 
   /// See AbstractAttribute::getAsStr().
-  const std::string getAsStr() const override {
+  virtual const std::string getAsStr() const override {
     return isAssumedDead() ? "assumed-dead" : "assumed-live";
   }
 
@@ -3538,6 +3590,15 @@
     });
   }
 
+  /// See AbstractAttribute::getAsStr().
+  const std::string getAsStr() const override {
+    // A store whose state is still valid gets its own description.
+    if (isa<StoreInst>(getAssociatedValue()) && isValidState())
+      return "assumed-dead-store";
+    // Everything else uses the generic description of the base class.
+    return AAIsDeadValueImpl::getAsStr();
+  }
+
   /// See AbstractAttribute::updateImpl(...).
   ChangeStatus updateImpl(Attributor &A) override {
     Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
@@ -4144,9 +4205,11 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override {}
 };
+} // namespace
 
 /// -------------------- Dereferenceable Argument Attribute --------------------
 
+namespace {
 struct AADereferenceableImpl : AADereferenceable {
   AADereferenceableImpl(const IRPosition &IRP, Attributor &A)
       : AADereferenceable(IRP, A) {}
@@ -4265,8 +4328,9 @@
       unsigned IdxWidth =
           DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace());
       APInt Offset(IdxWidth, 0);
-      const Value *Base =
-          stripAndAccumulateMinimalOffsets(A, *this, &V, DL, Offset, false);
+      const Value *Base = stripAndAccumulateOffsets(
+          A, *this, &V, DL, Offset, /* GetMinOffset */ false,
+          /* AllowNonInbounds */ true);
 
       const auto &AA = A.getAAFor<AADereferenceable>(
           *this, IRPosition::value(*Base), DepClassTy::REQUIRED);
@@ -4381,9 +4445,11 @@
     STATS_DECLTRACK_CS_ATTR(dereferenceable);
   }
 };
+} // namespace
 
 // ------------------------ Align Argument Attribute ------------------------
 
+namespace {
 static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
                                     Value &AssociatedValue, const Use *U,
                                     const Instruction *I, bool &TrackUse) {
@@ -4455,13 +4521,7 @@
       takeKnownMaximum(Attr.getValueAsInt());
 
     Value &V = getAssociatedValue();
-    // TODO: This is a HACK to avoid getPointerAlignment to introduce a ptr2int
-    //       use of the function pointer. This was caused by D73131. We want to
-    //       avoid this for function pointers especially because we iterate
-    //       their uses and int2ptr is not handled. It is not a correctness
-    //       problem though!
-    if (!V.getType()->getPointerElementType()->isFunctionTy())
-      takeKnownMaximum(V.getPointerAlignment(A.getDataLayout()).value());
+    takeKnownMaximum(V.getPointerAlignment(A.getDataLayout()).value());
 
     if (getIRPosition().isFnInterfaceKind() &&
         (!getAnchorScope() ||
@@ -4552,6 +4612,8 @@
 
     auto VisitValueCB = [&](Value &V, const Instruction *,
                             AAAlign::StateType &T, bool Stripped) -> bool {
+      if (isa<UndefValue>(V) || isa<ConstantPointerNull>(V))
+        return true;
       const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V),
                                            DepClassTy::REQUIRED);
       if (!Stripped && this == &AA) {
@@ -4559,6 +4621,7 @@
         unsigned Alignment = 1;
         if (const Value *Base =
                 GetPointerBaseWithConstantOffset(&V, Offset, DL)) {
+          // TODO: Use AAAlign for the base too.
           Align PA = Base->getPointerAlignment(DL);
           // BasePointerAddr + Offset = Alignment * Q for some integer Q.
           // So we can say that the maximum power of two which is a divisor of
@@ -4690,8 +4753,10 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
 };
+} // namespace
 
 /// ------------------ Function No-Return Attribute ----------------------------
+namespace {
 struct AANoReturnImpl : public AANoReturn {
   AANoReturnImpl(const IRPosition &IRP, Attributor &A) : AANoReturn(IRP, A) {}
 
@@ -4759,9 +4824,11 @@
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); }
 };
+} // namespace
 
 /// ----------------------- Variable Capturing ---------------------------------
 
+namespace {
 /// A class to hold the state of for no-capture attributes.
 struct AANoCaptureImpl : public AANoCapture {
   AANoCaptureImpl(const IRPosition &IRP, Attributor &A) : AANoCapture(IRP, A) {}
@@ -5214,6 +5281,7 @@
     STATS_DECLTRACK_CSRET_ATTR(nocapture)
   }
 };
+} // namespace
 
 /// ------------------ Value Simplify Attribute ----------------------------
 
@@ -5234,6 +5302,7 @@
   return true;
 }
 
+namespace {
 struct AAValueSimplifyImpl : AAValueSimplify {
   AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A)
       : AAValueSimplify(IRP, A) {}
@@ -5413,7 +5482,7 @@
 
       auto &PI = A.getAAFor<AAPointerInfo>(AA, IRPosition::value(*Obj),
                                            DepClassTy::REQUIRED);
-      if (!PI.forallInterferingWrites(A, AA, L, CheckAccess))
+      if (!PI.forallInterferingAccesses(A, AA, L, CheckAccess))
         return false;
     }
     return true;
@@ -5432,15 +5501,6 @@
                  Attribute::StructRet, Attribute::Nest, Attribute::ByVal},
                 /* IgnoreSubsumingPositions */ true))
       indicatePessimisticFixpoint();
-
-    // FIXME: This is a hack to prevent us from propagating function poiner in
-    // the new pass manager CGSCC pass as it creates call edges the
-    // CallGraphUpdater cannot handle yet.
-    Value &V = getAssociatedValue();
-    if (V.getType()->isPointerTy() &&
-        V.getType()->getPointerElementType()->isFunctionTy() &&
-        !A.isModulePass())
-      indicatePessimisticFixpoint();
   }
 
   /// See AbstractAttribute::updateImpl(...).
@@ -5539,6 +5599,11 @@
 
   ChangeStatus manifest(Attributor &A) override {
     ChangeStatus Changed = ChangeStatus::UNCHANGED;
+    if (!A.isRunOn(*getAnchorScope()))
+      return Changed;
+
+    assert(!hasCallBaseContext() && "Should never manifest a simplified "
+                                    "function return with call base context!");
 
     if (auto *NewV = getReplacementValue(A)) {
       auto PredForReturned =
@@ -5869,8 +5934,10 @@
     STATS_DECLTRACK_CSARG_ATTR(value_simplify)
   }
 };
+} // namespace
 
 /// ----------------------- Heap-To-Stack Conversion ---------------------------
+namespace {
 struct AAHeapToStackFunction final : public AAHeapToStack {
 
   struct AllocationInfo {
@@ -5954,6 +6021,16 @@
         /* CheckPotentiallyDead */ true);
     (void)Success;
     assert(Success && "Did not expect the call base visit callback to fail!");
+
+    Attributor::SimplifictionCallbackTy SCB =
+        [](const IRPosition &, const AbstractAttribute *,
+           bool &) -> Optional<Value *> { return nullptr; };
+    for (const auto &It : AllocationInfos)
+      A.registerSimplificationCallback(IRPosition::callsite_returned(*It.first),
+                                       SCB);
+    for (const auto &It : DeallocationInfos)
+      A.registerSimplificationCallback(IRPosition::callsite_returned(*It.first),
+                                       SCB);
   }
 
   const std::string getAsStr() const override {
@@ -6413,8 +6490,10 @@
 
   return Changed;
 }
+} // namespace
 
 /// ----------------------- Privatizable Pointers ------------------------------
+namespace {
 struct AAPrivatizablePtrImpl : public AAPrivatizablePtr {
   AAPrivatizablePtrImpl(const IRPosition &IRP, Attributor &A)
       : AAPrivatizablePtr(IRP, A), PrivatizableType(llvm::None) {}
@@ -7013,10 +7092,12 @@
     STATS_DECLTRACK_FNRET_ATTR(privatizable_ptr);
   }
 };
+} // namespace
 
 /// -------------------- Memory Behavior Attributes ----------------------------
 /// Includes read-none, read-only, and write-only.
 /// ----------------------------------------------------------------------------
+namespace {
 struct AAMemoryBehaviorImpl : public AAMemoryBehavior {
   AAMemoryBehaviorImpl(const IRPosition &IRP, Attributor &A)
       : AAMemoryBehavior(IRP, A) {}
@@ -7516,6 +7597,7 @@
   if (UserI->mayWriteToMemory())
     removeAssumedBits(NO_WRITES);
 }
+} // namespace
 
 /// -------------------- Memory Locations Attributes ---------------------------
 /// Includes read-none, argmemonly, inaccessiblememonly,
@@ -7549,6 +7631,7 @@
   return S;
 }
 
+namespace {
 struct AAMemoryLocationImpl : public AAMemoryLocation {
 
   AAMemoryLocationImpl(const IRPosition &IRP, Attributor &A)
@@ -8065,9 +8148,11 @@
       STATS_DECLTRACK_CS_ATTR(readnone)
   }
 };
+} // namespace
 
 /// ------------------ Value Constant Range Attribute -------------------------
 
+namespace {
 struct AAValueConstantRangeImpl : AAValueConstantRange {
   using StateType = IntegerRangeState;
   AAValueConstantRangeImpl(const IRPosition &IRP, Attributor &A)
@@ -8708,9 +8793,11 @@
     STATS_DECLTRACK_CSARG_ATTR(value_range)
   }
 };
+} // namespace
 
 /// ------------------ Potential Values Attribute -------------------------
 
+namespace {
 struct AAPotentialValuesImpl : AAPotentialValues {
   using StateType = PotentialConstantIntValuesState;
 
@@ -9895,8 +9982,10 @@
   /// This is for instruction queries than scan "forward".
   DenseMap<const Instruction *, QueryResolver> InstQueries;
 };
+} // namespace
 
 /// ---------------------- Assumption Propagation ------------------------------
+namespace {
 struct AAAssumptionInfoImpl : public AAAssumptionInfo {
   AAAssumptionInfoImpl(const IRPosition &IRP, Attributor &A,
                        const DenseSet<StringRef> &Known)
@@ -10030,6 +10119,7 @@
     return Assumptions;
   }
 };
+} // namespace
 
 AACallGraphNode *AACallEdgeIterator::operator*() const {
   return static_cast<AACallGraphNode *>(const_cast<AACallEdges *>(
diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -352,7 +352,7 @@
   // TODO: Pass BFI and BPI to update profile information.
   CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
                    /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
-                   /* AllowAlloca */ false,
+                   /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
                    /* Suffix */ "cold." + std::to_string(Count));
 
   // Perform a simple cost/benefit analysis to decide whether or not to permit
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -2679,7 +2679,7 @@
       OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
       OS->CE = new (ExtractorAllocator.Allocate())
           CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
-                        false, "outlined");
+                        false, nullptr, "outlined");
       findAddInputsOutputs(M, *OS, NotSame);
       if (!OS->IgnoreRegion)
         OutlinedRegions.push_back(OS);
@@ -2790,7 +2790,7 @@
       OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
       OS->CE = new (ExtractorAllocator.Allocate())
           CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
-                        false, "outlined");
+                        false, nullptr, "outlined");
       bool FunctionOutlined = extractSection(*OS);
       if (FunctionOutlined) {
         unsigned StartIdx = OS->Candidate->getStartIdx();
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2966,9 +2966,15 @@
     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
     auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
 
+    Attributor::SimplifictionCallbackTy SCB =
+        [](const IRPosition &, const AbstractAttribute *,
+           bool &) -> Optional<Value *> { return nullptr; };
     for (User *U : RFI.Declaration->users())
-      if (CallBase *CB = dyn_cast<CallBase>(U))
+      if (CallBase *CB = dyn_cast<CallBase>(U)) {
         MallocCalls.insert(CB);
+        A.registerSimplificationCallback(IRPosition::callsite_returned(*CB),
+                                         SCB);
+      }
 
     findPotentialRemovedFreeCalls(A);
   }
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -74,6 +74,9 @@
   }
 
   void popLastConstraint(bool Signed) { getCS(Signed).popLastConstraint(); }
+  void popLastNVariables(bool Signed, unsigned N) {
+    getCS(Signed).popLastNVariables(N);
+  }
 };
 
 /// Struct to express a pre-condition of the form %Op0 Pred %Op1.
@@ -150,9 +153,9 @@
   }
 
   if (auto *CI = dyn_cast<ConstantInt>(V)) {
-    if (CI->isNegative() || CI->uge(MaxConstraintValue))
+    if (CI->uge(MaxConstraintValue))
       return {};
-    return {{CI->getSExtValue(), nullptr}};
+    return {{CI->getZExtValue(), nullptr}};
   }
   auto *GEP = dyn_cast<GetElementPtrInst>(V);
   if (GEP && GEP->getNumOperands() == 2 && GEP->isInBounds()) {
@@ -205,8 +208,9 @@
 
   Value *Op1;
   ConstantInt *CI;
-  if (match(V, m_NUWAdd(m_Value(Op0), m_ConstantInt(CI))))
-    return {{CI->getSExtValue(), nullptr}, {1, Op0}};
+  if (match(V, m_NUWAdd(m_Value(Op0), m_ConstantInt(CI))) &&
+      !CI->uge(MaxConstraintValue))
+    return {{CI->getZExtValue(), nullptr}, {1, Op0}};
   if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative()) {
     Preconditions.emplace_back(
         CmpInst::ICMP_UGE, Op0,
@@ -371,11 +375,14 @@
   Instruction *Condition;
   bool IsNot;
   bool IsSigned = false;
+  /// Variables that can be removed from the system once the stack entry gets
+  /// removed.
+  SmallVector<Value *, 2> ValuesToRelease;
 
-  StackEntry(unsigned NumIn, unsigned NumOut, Instruction *Condition,
-             bool IsNot, bool IsSigned)
+  StackEntry(unsigned NumIn, unsigned NumOut, CmpInst *Condition, bool IsNot,
+             bool IsSigned, SmallVector<Value *, 2> ValuesToRelease)
       : NumIn(NumIn), NumOut(NumOut), Condition(Condition), IsNot(IsNot),
-        IsSigned(IsSigned) {}
+        IsSigned(IsSigned), ValuesToRelease(ValuesToRelease) {}
 };
 } // namespace
 
@@ -407,6 +414,19 @@
       continue;
     WorkList.emplace_back(DT.getNode(&BB));
 
+    // Returns true if we can add a known condition from BB to its successor
+    // block Succ. Each predecessor of Succ can either be BB or be dominated by
+    // Succ (e.g. the case when adding a condition from a pre-header to a loop
+    // header).
+    auto CanAdd = [&BB, &DT](BasicBlock *Succ) {
+      assert(isa<BranchInst>(BB.getTerminator()));
+      return any_of(successors(&BB),
+                    [Succ](const BasicBlock *S) { return S != Succ; }) &&
+             all_of(predecessors(Succ), [&BB, &DT, Succ](BasicBlock *Pred) {
+               return Pred == &BB || DT.dominates(Succ, Pred);
+             });
+    };
+
     // True as long as long as the current instruction is guaranteed to execute.
     bool GuaranteedToExecute = true;
     // Scan BB for assume calls.
@@ -425,9 +445,12 @@
           WorkList.emplace_back(DT.getNode(&BB), cast<ICmpInst>(Cond), false);
         } else {
           // Otherwise the condition only holds in the successors.
-          for (BasicBlock *Succ : successors(&BB))
+          for (BasicBlock *Succ : successors(&BB)) {
+            if (!CanAdd(Succ))
+              continue;
             WorkList.emplace_back(DT.getNode(Succ), cast<ICmpInst>(Cond),
                                   false);
+          }
         }
       }
       GuaranteedToExecute &= isGuaranteedToTransferExecutionToSuccessor(&I);
@@ -437,18 +460,6 @@
     if (!Br || !Br->isConditional())
       continue;
 
-    // Returns true if we can add a known condition from BB to its successor
-    // block Succ. Each predecessor of Succ can either be BB or be dominated by
-    // Succ (e.g. the case when adding a condition from a pre-header to a loop
-    // header).
-    auto CanAdd = [&BB, &DT](BasicBlock *Succ) {
-      assert(isa<BranchInst>(BB.getTerminator()));
-      return any_of(successors(&BB),
-                    [Succ](const BasicBlock *S) { return S != Succ; }) &&
-             all_of(predecessors(Succ), [&BB, &DT, Succ](BasicBlock *Pred) {
-               return Pred == &BB || DT.dominates(Succ, Pred);
-             });
-    };
     // If the condition is an OR of 2 compares and the false successor only has
     // the current block as predecessor, queue both negated conditions for the
     // false successor.
@@ -512,8 +523,13 @@
         break;
       LLVM_DEBUG(dbgs() << "Removing " << *E.Condition << " " << E.IsNot
                         << "\n");
-      DFSInStack.pop_back();
       Info.popLastConstraint(E.IsSigned);
+      // Remove variables in the system that went out of scope.
+      auto &Mapping = Info.getValue2Index(E.IsSigned);
+      for (Value *V : E.ValuesToRelease)
+        Mapping.erase(V);
+      Info.popLastNVariables(E.IsSigned, E.ValuesToRelease.size());
+      DFSInStack.pop_back();
     }
 
     LLVM_DEBUG({
@@ -603,10 +619,6 @@
     if (!R.isValid(Info))
       continue;
 
-    for (auto &KV : NewIndices)
-      Info.getValue2Index(CmpInst::isSigned(CB.Condition->getPredicate()))
-          .insert(KV);
-
     LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n");
     bool Added = false;
     assert(CmpInst::isSigned(CB.Condition->getPredicate()) == R.IsSigned &&
@@ -620,8 +632,11 @@
     // If R has been added to the system, queue it for removal once it goes
     // out-of-scope.
     if (Added) {
-      for (auto &KV : NewIndices)
+      SmallVector<Value *, 2> ValuesToRelease;
+      for (auto &KV : NewIndices) {
         Info.getValue2Index(R.IsSigned).insert(KV);
+        ValuesToRelease.push_back(KV.first);
+      }
 
       LLVM_DEBUG({
         dbgs() << "  constraint: ";
@@ -629,7 +644,7 @@
       });
 
       DFSInStack.emplace_back(CB.NumIn, CB.NumOut, CB.Condition, CB.Not,
-                              R.IsSigned);
+                              R.IsSigned, ValuesToRelease);
 
       if (R.IsEq) {
         // Also add the inverted constraint for equality constraints.
@@ -638,7 +653,7 @@
         CSToUse.addVariableRowFill(R.Coefficients);
 
         DFSInStack.emplace_back(CB.NumIn, CB.NumOut, CB.Condition, CB.Not,
-                                R.IsSigned);
+                                R.IsSigned, SmallVector<Value *, 2>());
       }
     }
   }
diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
--- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -309,12 +309,12 @@
 
 #ifndef NDEBUG
     // LoopAnalysisResults should always be valid.
-    // Note that we don't LAR.SE.verify() because that can change observed SE
-    // queries. See PR44815.
     if (VerifyDomInfo)
       LAR.DT.verify();
     if (VerifyLoopInfo)
       LAR.LI.verify(LAR.DT);
+    if (VerifySCEV)
+      LAR.SE.verify();
     if (LAR.MSSA && VerifyMemorySSA)
       LAR.MSSA->verifyMemorySSA();
 #endif
diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp
--- a/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -319,6 +319,8 @@
   // on B (A appears after B), A needs to be sinked first before B can be
   // sinked.
   for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
+    if (isa<PHINode>(&I))
+      continue;
     // No need to check for instruction's operands are loop invariant.
     assert(L.hasLoopInvariantOperands(&I) &&
            "Insts in a loop's preheader should have loop invariant operands!");
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -246,9 +246,10 @@
                              bool AggregateArgs, BlockFrequencyInfo *BFI,
                              BranchProbabilityInfo *BPI, AssumptionCache *AC,
                              bool AllowVarArgs, bool AllowAlloca,
-                             std::string Suffix)
+                             BasicBlock *AllocationBlock, std::string Suffix)
     : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
-      BPI(BPI), AC(AC), AllowVarArgs(AllowVarArgs),
+      BPI(BPI), AC(AC), AllocationBlock(AllocationBlock),
+      AllowVarArgs(AllowVarArgs),
       Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)),
       Suffix(Suffix) {}
 
@@ -257,7 +258,7 @@
                              BranchProbabilityInfo *BPI, AssumptionCache *AC,
                              std::string Suffix)
     : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
-      BPI(BPI), AC(AC), AllowVarArgs(false),
+      BPI(BPI), AC(AC), AllocationBlock(nullptr), AllowVarArgs(false),
       Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
                                      /* AllowVarArgs */ false,
                                      /* AllowAlloca */ false)),
@@ -1189,9 +1190,10 @@
 
     // Allocate a struct at the beginning of this function
     StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
-    Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
-                            "structArg",
-                            &codeReplacer->getParent()->front().front());
+    Struct = new AllocaInst(
+        StructArgTy, DL.getAllocaAddrSpace(), nullptr, "structArg",
+        AllocationBlock ? &*AllocationBlock->getFirstInsertionPt()
+                        : &codeReplacer->getParent()->front().front());
     params.push_back(Struct);
 
     // Store aggregated inputs in the struct.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10356,8 +10356,8 @@
   const std::string DebugLocStr = getDebugLocString(L);
 #endif /* NDEBUG */
 
-  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
-                    << L->getHeader()->getParent()->getName() << "\" from "
+  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
+                    << L->getHeader()->getParent()->getName() << "' from "
                     << DebugLocStr << "\n");
 
   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith.ll b/llvm/test/Analysis/CostModel/AArch64/arith.ll
--- a/llvm/test/Analysis/CostModel/AArch64/arith.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -cost-model -analyze -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+; RUN: opt -passes='print<cost-model>' -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64-linux-gnu < %s | FileCheck %s
 
 define void @i1() {
 ; CHECK-LABEL: 'i1'
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -cost-model -analyze -mtriple=aarch64-none-linux-gnueabi %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
-; RUN: opt -cost-model -analyze -mtriple=aarch64-none-linux-gnueabi -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
+; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64-none-linux-gnueabi %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
+; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=aarch64-none-linux-gnueabi -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
 
 define void @ext() {
 ; CHECK-LABEL: 'ext'
diff --git a/llvm/test/Analysis/CostModel/AArch64/min-max.ll b/llvm/test/Analysis/CostModel/AArch64/min-max.ll
--- a/llvm/test/Analysis/CostModel/AArch64/min-max.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/min-max.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -cost-model -analyze -cost-kind=throughput | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
-; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+fullfp16 -cost-model -analyze -cost-kind=throughput | FileCheck %s --check-prefixes=CHECK,CHECK-F16
+; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -passes='print<cost-model>' 2>&1 -disable-output -cost-kind=throughput | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
+; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+fullfp16 -passes='print<cost-model>' 2>&1 -disable-output -cost-kind=throughput | FileCheck %s --check-prefixes=CHECK,CHECK-F16
 
 define void @umin() {
 ; CHECK-LABEL: 'umin'
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i16] zeroinitializer, align 128
 @C = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i16, i16* %inB, align 2
 ; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction:   %valB = load i16, i16* %inB, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i32] zeroinitializer, align 128
 @C = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i32, i32* %inB, align 4
 ; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction:   %valB = load i32, i32* %inB, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i64] zeroinitializer, align 128
 @C = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i64, i64* %inB, align 8
 ; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction:   %valB = load i64, i64* %inB, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i8] zeroinitializer, align 128
 @C = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB = load i8, i8* %inB, align 1
 ; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction:   %valB = load i8, i8* %inB, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x float] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x float] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x float] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x float] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x double] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x double] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x double] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x double] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; SSE2: LV: Found an estimated cost of 17 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2-indices-0u.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-01u.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-012u.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-0uuu.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
 ; SSE2: LV: Found an estimated cost of 42 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i64] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i64] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i64] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i64] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
 ; SSE2: LV: Found an estimated cost of 42 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; SSE2: LV: Found an estimated cost of 23 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; SSE2: LV: Found an estimated cost of 47 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x float] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v1, float* %out1, align 4
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store float %v1, float* %out1, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x float] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v2, float* %out2, align 4
 ; SSE2: LV: Found an estimated cost of 11 for VF 2 For instruction:   store float %v2, float* %out2, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x float] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v3, float* %out3, align 4
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   store float %v3, float* %out3, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x float] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v5, float* %out5, align 4
 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction:   store float %v5, float* %out5, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x double] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v1, double* %out1, align 8
 ; SSE2: LV: Found an estimated cost of 6 for VF 2 For instruction:   store double %v1, double* %out1, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x double] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v2, double* %out2, align 8
 ; SSE2: LV: Found an estimated cost of 10 for VF 2 For instruction:   store double %v2, double* %out2, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x double] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v3, double* %out3, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   store double %v3, double* %out3, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x double] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v5, double* %out5, align 8
 ; SSE2: LV: Found an estimated cost of 20 for VF 2 For instruction:   store double %v5, double* %out5, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v1, i16* %out1, align 2
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %v1, i16* %out1, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
 ; SSE2: LV: Found an estimated cost of 16 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v3, i16* %out3, align 2
 ; SSE2: LV: Found an estimated cost of 17 for VF 2 For instruction:   store i16 %v3, i16* %out3, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
 ; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
 ; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v1, i32* %out1, align 4
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i32 %v1, i32* %out1, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v2, i32* %out2, align 4
 ; SSE2: LV: Found an estimated cost of 23 for VF 2 For instruction:   store i32 %v2, i32* %out2, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v3, i32* %out3, align 4
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i32 %v3, i32* %out3, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v5, i32* %out5, align 4
 ; SSE2: LV: Found an estimated cost of 45 for VF 2 For instruction:   store i32 %v5, i32* %out5, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v1, i64* %out1, align 8
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   store i64 %v1, i64* %out1, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v2, i64* %out2, align 8
 ; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction:   store i64 %v2, i64* %out2, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v3, i64* %out3, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i64 %v3, i64* %out3, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v5, i64* %out5, align 8
 ; SSE2: LV: Found an estimated cost of 44 for VF 2 For instruction:   store i64 %v5, i64* %out5, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v1, i8* %out1, align 1
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v2, i8* %out2, align 1
 ; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction:   store i8 %v2, i8* %out2, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v3, i8* %out3, align 1
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i8 %v3, i8* %out3, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll
@@ -10,7 +10,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v5, i8* %out5, align 1
 ; SSE2: LV: Found an estimated cost of 49 for VF 2 For instruction:   store i8 %v5, i8* %out5, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i32] zeroinitializer, align 128
 @C = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i64] zeroinitializer, align 128
 @C = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll
@@ -17,7 +17,7 @@
 ;     }
 ; (relates to the testcase in PR50566)
 
-; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test1"
+; DISABLED_MASKED_STRIDED: LV: Checking a loop in 'test1'
 ;
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
@@ -34,7 +34,7 @@
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 62 for VF 16 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 62 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 
-; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test1"
+; ENABLED_MASKED_STRIDED: LV: Checking a loop in 'test1'
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
@@ -84,7 +84,7 @@
 ;       y[i] = points[i*4 + 1];
 ;     }
 
-; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test2"
+; DISABLED_MASKED_STRIDED: LV: Checking a loop in 'test2'
 ;
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
@@ -101,7 +101,7 @@
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 
-; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test2"
+; ENABLED_MASKED_STRIDED: LV: Checking a loop in 'test2'
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
@@ -161,7 +161,7 @@
 ;         x[i] = points[i*3];
 ;     }
 
-; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test"
+; DISABLED_MASKED_STRIDED: LV: Checking a loop in 'test'
 ;
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
@@ -169,7 +169,7 @@
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 
-; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test"
+; ENABLED_MASKED_STRIDED: LV: Checking a loop in 'test'
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 7 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
@@ -17,7 +17,7 @@
 ;     }
 ; (relates to the testcase in PR50566)
 
-; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test1"
+; DISABLED_MASKED_STRIDED: LV: Checking a loop in 'test1'
 ;
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %0, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %2, i16* %arrayidx7, align 2
@@ -34,7 +34,7 @@
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 68 for VF 16 For instruction:   store i16 %0, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 68 for VF 16 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 
-; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test1"
+; ENABLED_MASKED_STRIDED: LV: Checking a loop in 'test1'
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %0, i16* %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %2, i16* %arrayidx7, align 2
@@ -84,7 +84,7 @@
 ;       points[i*4 + 1] = y[i];
 ;     }
 
-; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test2"
+; DISABLED_MASKED_STRIDED: LV: Checking a loop in 'test2'
 ;
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %0, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %2, i16* %arrayidx7, align 2
@@ -101,7 +101,7 @@
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 50 for VF 16 For instruction:   store i16 %0, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 
-; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test2"
+; ENABLED_MASKED_STRIDED: LV: Checking a loop in 'test2'
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %0, i16* %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %2, i16* %arrayidx7, align 2
@@ -161,7 +161,7 @@
 ;         points[i*3] = x[i];
 ;     }
 
-; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test"
+; DISABLED_MASKED_STRIDED: LV: Checking a loop in 'test'
 ;
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %0, i16* %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %0, i16* %arrayidx6, align 2
@@ -169,7 +169,7 @@
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction:   store i16 %0, i16* %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 16 For instruction:   store i16 %0, i16* %arrayidx6, align 2
 
-; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test"
+; ENABLED_MASKED_STRIDED: LV: Checking a loop in 'test'
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %0, i16* %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %0, i16* %arrayidx6, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll
@@ -13,7 +13,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @C = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
 ; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/masked-load-i32.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i32.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-load-i32.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-load-i32.ll
@@ -13,7 +13,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @C = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-load-i64.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i64.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-load-i64.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-load-i64.ll
@@ -13,7 +13,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @C = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll
@@ -13,7 +13,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @C = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
 ; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i32] zeroinitializer, align 128
 @C = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-scatter-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-scatter-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-scatter-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-scatter-i64-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i64] zeroinitializer, align 128
 @C = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/masked-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-store-i16.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-store-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-store-i16.ll
@@ -13,7 +13,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/masked-store-i32.ll b/llvm/test/Analysis/CostModel/X86/masked-store-i32.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-store-i32.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-store-i32.ll
@@ -13,7 +13,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-store-i64.ll b/llvm/test/Analysis/CostModel/X86/masked-store-i64.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-store-i64.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-store-i64.ll
@@ -13,7 +13,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/masked-store-i8.ll b/llvm/test/Analysis/CostModel/X86/masked-store-i8.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-store-i8.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-store-i8.ll
@@ -13,7 +13,7 @@
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
 ; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i16] zeroinitializer, align 128
 @C = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %valB, i16* %out, align 2
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i16 %valB, i16* %out, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i32] zeroinitializer, align 128
 @C = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %valB, i32* %out, align 4
 ; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i32 %valB, i32* %out, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i64] zeroinitializer, align 128
 @C = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %valB, i64* %out, align 8
 ; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i64 %valB, i64* %out, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
@@ -14,7 +14,7 @@
 @B = global [1024 x i8] zeroinitializer, align 128
 @C = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %valB, i8* %out, align 1
 ; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i8 %valB, i8* %out, align 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s
 
 
-; CHECK: test1
+; CHECK-LABEL: test1
 ; CHECK: movi.16b v[[REG0:[0-9]+]], #0
 define <8 x i1> @test1() {
 entry:
@@ -14,16 +14,16 @@
   ret <8 x i1> %Shuff
 }
 
-; CHECK: lCPI1_0:
-; CHECK:          .byte   0                       ; 0x0
-; CHECK:          .byte   0                       ; 0x0
+; CHECK-LABEL: lCPI1_0:
 ; CHECK:          .byte   0                       ; 0x0
+; CHECK:          .space  1
 ; CHECK:          .byte   0                       ; 0x0
+; CHECK:          .space  1
 ; CHECK:          .byte   1                       ; 0x1
 ; CHECK:          .byte   0                       ; 0x0
 ; CHECK:          .byte   0                       ; 0x0
 ; CHECK:          .byte   0                       ; 0x0
-; CHECK: test2
+; CHECK-LABEL: test2
 ; CHECK: adrp    x[[REG2:[0-9]+]], lCPI1_0@PAGE
 ; CHECK: ldr     d[[REG1:[0-9]+]], [x[[REG2]], lCPI1_0@PAGEOFF]
 define <8 x i1>@test2() {
@@ -35,7 +35,7 @@
   ret <8 x i1> %Shuff
 }
 
-; CHECK: test3
+; CHECK-LABEL: test3
 ; CHECK: movi.4s v{{[0-9]+}}, #1
 define <16 x i1> @test3(i1* %ptr, i32 %v) {
 bb:
@@ -45,7 +45,7 @@
                  i32 14, i32 0>
   ret <16 x i1> %Shuff
 }
-; CHECK: lCPI3_0:
+; CHECK-LABEL: lCPI3_0:
 ; CHECK:         .byte   0                       ; 0x0
 ; CHECK:         .byte   0                       ; 0x0
 ; CHECK:         .byte   0                       ; 0x0
@@ -62,7 +62,7 @@
 ; CHECK:         .byte   0                       ; 0x0
 ; CHECK:         .byte   0                       ; 0x0
 ; CHECK:         .byte   0                       ; 0x0
-; CHECK: _test4:
+; CHECK-LABEL: _test4:
 ; CHECK:         adrp    x[[REG3:[0-9]+]], lCPI3_0@PAGE
 ; CHECK:         ldr     q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0@PAGEOFF]
 define <16 x i1> @test4(i1* %ptr, i32 %v) {
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -3004,55 +3004,22 @@
 ; CHECK-LABEL: test_signed_v16f32_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v4.4s, #127
+; CHECK-NEXT:    fcvtzs v3.4s, v3.4s
+; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    mvni v5.4s, #127
-; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
-; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
-; CHECK-NEXT:    smin v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    smin v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    smin v3.4s, v3.4s, v4.4s
 ; CHECK-NEXT:    smin v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v5.4s
-; CHECK-NEXT:    smax v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    smax v2.4s, v2.4s, v5.4s
-; CHECK-NEXT:    xtn v6.4h, v0.4s
-; CHECK-NEXT:    umov w8, v6.h[0]
-; CHECK-NEXT:    umov w9, v6.h[1]
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    umov w8, v6.h[2]
-; CHECK-NEXT:    mov v0.b[1], w9
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    umov w8, v6.h[3]
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    umov w8, v1.h[0]
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    umov w8, v1.h[1]
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    umov w8, v1.h[2]
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    umov w8, v1.h[3]
-; CHECK-NEXT:    xtn v1.4h, v2.4s
-; CHECK-NEXT:    fcvtzs v2.4s, v3.4s
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    umov w8, v1.h[0]
-; CHECK-NEXT:    smin v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    mov v0.b[8], w8
-; CHECK-NEXT:    umov w8, v1.h[1]
+; CHECK-NEXT:    smin v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    smin v0.4s, v0.4s, v4.4s
+; CHECK-NEXT:    smax v3.4s, v3.4s, v5.4s
 ; CHECK-NEXT:    smax v2.4s, v2.4s, v5.4s
-; CHECK-NEXT:    mov v0.b[9], w8
-; CHECK-NEXT:    umov w8, v1.h[2]
-; CHECK-NEXT:    mov v0.b[10], w8
-; CHECK-NEXT:    umov w8, v1.h[3]
-; CHECK-NEXT:    xtn v1.4h, v2.4s
-; CHECK-NEXT:    mov v0.b[11], w8
-; CHECK-NEXT:    umov w8, v1.h[0]
-; CHECK-NEXT:    mov v0.b[12], w8
-; CHECK-NEXT:    umov w8, v1.h[1]
-; CHECK-NEXT:    mov v0.b[13], w8
-; CHECK-NEXT:    umov w8, v1.h[2]
-; CHECK-NEXT:    mov v0.b[14], w8
-; CHECK-NEXT:    umov w8, v1.h[3]
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    smax v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v5.4s
+; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
     %x = call <16 x i8> @llvm.fptosi.sat.v16f32.v16i8(<16 x float> %f)
     ret <16 x i8> %x
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -2515,50 +2515,17 @@
 ; CHECK-LABEL: test_unsigned_v16f32_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v4.2d, #0x0000ff000000ff
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v3.4s, v3.4s
 ; CHECK-NEXT:    fcvtzu v2.4s, v2.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    umin v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    umin v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    xtn v5.4h, v0.4s
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umov w8, v5.h[0]
-; CHECK-NEXT:    umov w9, v5.h[1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    umov w8, v5.h[2]
-; CHECK-NEXT:    mov v0.b[1], w9
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    umov w8, v5.h[3]
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    umov w8, v1.h[0]
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    umov w8, v1.h[1]
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    umov w8, v1.h[2]
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    umov w8, v1.h[3]
-; CHECK-NEXT:    xtn v1.4h, v2.4s
-; CHECK-NEXT:    fcvtzu v2.4s, v3.4s
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    umin v3.4s, v3.4s, v4.4s
 ; CHECK-NEXT:    umin v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    mov v0.b[8], w8
-; CHECK-NEXT:    umov w8, v1.h[1]
-; CHECK-NEXT:    mov v0.b[9], w8
-; CHECK-NEXT:    umov w8, v1.h[2]
-; CHECK-NEXT:    mov v0.b[10], w8
-; CHECK-NEXT:    umov w8, v1.h[3]
-; CHECK-NEXT:    xtn v1.4h, v2.4s
-; CHECK-NEXT:    mov v0.b[11], w8
-; CHECK-NEXT:    umov w8, v1.h[0]
-; CHECK-NEXT:    mov v0.b[12], w8
-; CHECK-NEXT:    umov w8, v1.h[1]
-; CHECK-NEXT:    mov v0.b[13], w8
-; CHECK-NEXT:    umov w8, v1.h[2]
-; CHECK-NEXT:    mov v0.b[14], w8
-; CHECK-NEXT:    umov w8, v1.h[3]
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    umin v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    umin v0.4s, v0.4s, v4.4s
+; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
     %x = call <16 x i8> @llvm.fptoui.sat.v16f32.v16i8(<16 x float> %f)
     ret <16 x i8> %x
diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
--- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
@@ -84,43 +84,13 @@
 define <16 x i8> @extract_4_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
 ; CHECK-LABEL: extract_4_v4i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    umov w10, v0.h[1]
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    umov w8, v2.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    umov w9, v0.h[2]
-; CHECK-NEXT:    mov v4.b[1], w10
-; CHECK-NEXT:    umov w10, v0.h[3]
-; CHECK-NEXT:    mov v4.b[2], w9
-; CHECK-NEXT:    umov w9, v1.h[0]
-; CHECK-NEXT:    mov v4.b[3], w10
-; CHECK-NEXT:    umov w10, v1.h[1]
-; CHECK-NEXT:    mov v4.b[4], w9
-; CHECK-NEXT:    umov w9, v1.h[2]
-; CHECK-NEXT:    mov v4.b[5], w10
-; CHECK-NEXT:    umov w10, v1.h[3]
-; CHECK-NEXT:    mov v4.b[6], w9
-; CHECK-NEXT:    umov w9, v2.h[1]
-; CHECK-NEXT:    mov v4.b[7], w10
-; CHECK-NEXT:    mov v4.b[8], w8
-; CHECK-NEXT:    umov w8, v2.h[2]
-; CHECK-NEXT:    mov v4.b[9], w9
-; CHECK-NEXT:    umov w9, v2.h[3]
-; CHECK-NEXT:    mov v4.b[10], w8
-; CHECK-NEXT:    umov w8, v3.h[0]
-; CHECK-NEXT:    mov v4.b[11], w9
-; CHECK-NEXT:    umov w9, v3.h[1]
-; CHECK-NEXT:    mov v4.b[12], w8
-; CHECK-NEXT:    umov w8, v3.h[2]
-; CHECK-NEXT:    mov v4.b[13], w9
-; CHECK-NEXT:    umov w9, v3.h[3]
-; CHECK-NEXT:    mov v4.b[14], w8
-; CHECK-NEXT:    mov v4.b[15], w9
-; CHECK-NEXT:    mov v0.16b, v4.16b
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %a0 = extractelement <4 x i16> %a, i32 0
@@ -177,36 +147,9 @@
 define <16 x i8> @extract_4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; CHECK-LABEL: extract_4_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    mov w9, v0.s[2]
-; CHECK-NEXT:    mov w10, v0.s[3]
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov v0.b[2], w9
-; CHECK-NEXT:    mov w9, v1.s[1]
-; CHECK-NEXT:    mov v0.b[3], w10
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    mov w8, v1.s[2]
-; CHECK-NEXT:    mov v0.b[5], w9
-; CHECK-NEXT:    mov w9, v1.s[3]
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov v0.b[7], w9
-; CHECK-NEXT:    mov w9, v2.s[1]
-; CHECK-NEXT:    mov v0.b[8], w8
-; CHECK-NEXT:    mov w8, v2.s[2]
-; CHECK-NEXT:    mov v0.b[9], w9
-; CHECK-NEXT:    mov w9, v2.s[3]
-; CHECK-NEXT:    mov v0.b[10], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov v0.b[11], w9
-; CHECK-NEXT:    mov w9, v3.s[1]
-; CHECK-NEXT:    mov v0.b[12], w8
-; CHECK-NEXT:    mov w8, v3.s[2]
-; CHECK-NEXT:    mov v0.b[13], w9
-; CHECK-NEXT:    mov w9, v3.s[3]
-; CHECK-NEXT:    mov v0.b[14], w8
-; CHECK-NEXT:    mov v0.b[15], w9
+; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %a0 = extractelement <4 x i32> %a, i32 0
@@ -263,41 +206,12 @@
 define <16 x i8> @extract_4_mixed(<4 x i16> %a, <4 x i32> %b, <4 x i32> %c, <4 x i16> %d) {
 ; CHECK-LABEL: extract_4_mixed:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
+; CHECK-NEXT:    xtn v2.4h, v2.4s
 ; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    umov w8, v0.h[2]
-; CHECK-NEXT:    mov v4.b[1], w9
-; CHECK-NEXT:    umov w9, v0.h[3]
-; CHECK-NEXT:    mov v4.b[2], w8
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov v4.b[3], w9
-; CHECK-NEXT:    mov w9, v1.s[1]
-; CHECK-NEXT:    mov v4.b[4], w8
-; CHECK-NEXT:    mov w8, v1.s[2]
-; CHECK-NEXT:    mov v4.b[5], w9
-; CHECK-NEXT:    mov w9, v1.s[3]
-; CHECK-NEXT:    mov v4.b[6], w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov v4.b[7], w9
-; CHECK-NEXT:    mov w9, v2.s[1]
-; CHECK-NEXT:    mov v4.b[8], w8
-; CHECK-NEXT:    mov w8, v2.s[2]
-; CHECK-NEXT:    mov v4.b[9], w9
-; CHECK-NEXT:    mov w9, v2.s[3]
-; CHECK-NEXT:    mov v4.b[10], w8
-; CHECK-NEXT:    umov w8, v3.h[0]
-; CHECK-NEXT:    mov v4.b[11], w9
-; CHECK-NEXT:    umov w9, v3.h[1]
-; CHECK-NEXT:    mov v4.b[12], w8
-; CHECK-NEXT:    umov w8, v3.h[2]
-; CHECK-NEXT:    mov v4.b[13], w9
-; CHECK-NEXT:    umov w9, v3.h[3]
-; CHECK-NEXT:    mov v4.b[14], w8
-; CHECK-NEXT:    mov v4.b[15], w9
-; CHECK-NEXT:    mov v0.16b, v4.16b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    xtn2 v0.8h, v1.4s
+; CHECK-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %a0 = extractelement <4 x i16> %a, i32 0
@@ -440,25 +354,8 @@
 define <16 x i8> @extract_4_v4i32_one(<4 x i32> %a) {
 ; CHECK-LABEL: extract_4_v4i32_one:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov w10, v0.s[2]
-; CHECK-NEXT:    mov w11, v0.s[3]
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov v0.b[2], w10
-; CHECK-NEXT:    mov v0.b[3], w11
-; CHECK-NEXT:    mov v0.b[4], w9
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov v0.b[6], w10
-; CHECK-NEXT:    mov v0.b[7], w11
-; CHECK-NEXT:    mov v0.b[8], w9
-; CHECK-NEXT:    mov v0.b[9], w8
-; CHECK-NEXT:    mov v0.b[10], w10
-; CHECK-NEXT:    mov v0.b[11], w11
-; CHECK-NEXT:    mov v0.b[12], w9
-; CHECK-NEXT:    mov v0.b[13], w8
-; CHECK-NEXT:    mov v0.b[14], w10
-; CHECK-NEXT:    mov v0.b[15], w11
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    ret
 entry:
   %a0 = extractelement <4 x i32> %a, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s
+
+; Subtargets must wait for outstanding memory instructions before a barrier if
+; they cannot back off of the barrier.
+
+define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 {
+; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX9-NO-BACKOFF:       ; %bb.0:
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_barrier
+; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX9-BACKOFF:       ; %bb.0:
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-BACKOFF-NEXT:    s_barrier
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX10-BACKOFF:       ; %bb.0:
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-BACKOFF-NEXT:    s_barrier
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+  %load = load i32, i32* %in
+  call void @llvm.amdgcn.s.barrier()
+  store i32 %load, i32* %out
+  ret void
+}
+
+define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 {
+; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX9-NO-BACKOFF:       ; %bb.0:
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_barrier
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX9-BACKOFF:       ; %bb.0:
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    s_barrier
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX10-BACKOFF:       ; %bb.0:
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    s_barrier
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    buffer_gl0_inv
+; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+  %load = load i32, i32* %in
+  fence syncscope("workgroup") release
+  call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  store i32 %load, i32* %out
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier()
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
@@ -0,0 +1,199 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -enable-ipra=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -enable-ipra=0 < %s | FileCheck -check-prefix=GCN %s
+
+; This test makes sure the return address registers, if clobbered in the function
+; or if the function has calls, are saved/restored with IPRA both enabled and disabled.
+
+; TODO: An artificial test with high register pressure would be more reliable in the
+; long run as branches on constants could be fragile.
+
+%struct.ShaderData = type { <3 x float>, <3 x float>, <3 x float>, <3 x float>, i32, i32, i32, i32, i32, float, float, i32, i32, float, float, %struct.differential3, %struct.differential3, %struct.differential, %struct.differential, <3 x float>, <3 x float>, <3 x float>, %struct.differential3, i32, i32, i32, float, <3 x float>, <3 x float>, <3 x float>, [1 x %struct.ShaderClosure] }
+%struct.differential = type { float, float }
+%struct.differential3 = type { <3 x float>, <3 x float> }
+%struct.ShaderClosure = type { <3 x float>, i32, float, <3 x float>, [10 x float], [8 x i8] }
+%struct.MicrofacetExtra = type { <3 x float>, <3 x float>, <3 x float>, float, [12 x i8] }
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #0
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture) #1
+
+; Function Attrs: norecurse
+define internal fastcc void @svm_node_closure_bsdf(%struct.ShaderData addrspace(1)* %sd, float* %stack, <4 x i32> %node, i32* %offset, i32 %0, i8 %trunc, float %1, float %2, float %mul80, i1 %cmp412.old, <4 x i32> %3, float %4, i32 %5, i1 %cmp440, i1 %cmp442, i1 %or.cond1306, float %.op, %struct.ShaderClosure addrspace(1)* %arrayidx.i.i2202, %struct.ShaderClosure addrspace(1)* %retval.0.i.i22089, %struct.ShaderClosure addrspace(1)* %retval.1.i221310, i1 %cmp575, i32 addrspace(1)* %num_closure_left.i2215, i32 %6, i1 %cmp.i2216, i32 %7, i64 %idx.ext.i2223, i32 %sub5.i2221) #2 {
+; GCN-LABEL: {{^}}svm_node_closure_bsdf:
+; GCN-NOT: s30,
+; GCN-NOT: s31,
+; GCN: s_waitcnt vmcnt(0)
+; GCN: s_setpc_b64 s[30:31]
+; GCN: .size   svm_node_closure_bsdf
+entry:
+  %8 = extractelement <4 x i32> %node, i64 0
+  %cmp.i.not = icmp eq i32 undef, 0
+  br i1 undef, label %common.ret.critedge, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %9 = load float, float* null, align 4
+  %phi.cmp = fcmp oeq float %9, 0.000000e+00
+  br i1 %phi.cmp, label %common.ret, label %cond.true20
+
+cond.true20:                                      ; preds = %cond.true
+  %trunc1 = trunc i32 %0 to i8
+  switch i8 %trunc, label %common.ret [
+    i8 44, label %sw.bb
+    i8 0, label %if.end.i.i2285
+  ]
+
+sw.bb:                                            ; preds = %cond.true20
+  %10 = load float, float* null, align 4
+  %11 = load float, float* null, align 4
+  %12 = tail call float @llvm.amdgcn.fmed3.f32(float %1, float 0.000000e+00, float 0.000000e+00)
+  %mul802 = fmul nsz float %1, 0.000000e+00
+  %cmp412.old3 = fcmp nsz ogt float %1, 0.000000e+00
+  br i1 %cmp412.old, label %if.then413, label %common.ret
+
+if.then413:                                       ; preds = %sw.bb
+  %13 = load <4 x i32>, <4 x i32> addrspace(1)* null, align 16
+  %14 = extractelement <4 x i32> %node, i64 0
+  %cmp4404 = fcmp nsz ole float %1, 0.000000e+00
+  %cmp4425 = icmp eq i32 %0, 0
+  %or.cond13066 = select i1 %cmp412.old, i1 false, i1 %cmp412.old
+  br i1 %or.cond1306, label %if.then443, label %if.else568
+
+if.then443:                                       ; preds = %if.then413
+  br i1 true, label %if.end511, label %common.ret
+
+common.ret.critedge:                              ; preds = %entry
+  store i32 0, i32* null, align 4
+  br label %common.ret
+
+common.ret:                                       ; preds = %if.end.i.i2285, %if.end627.sink.split, %cond.end579, %bsdf_alloc.exit2188, %if.end511, %common.ret.critedge, %if.then443, %sw.bb, %cond.true20, %cond.true
+  ret void
+
+if.end511:                                        ; preds = %if.then443
+  br i1 false, label %common.ret, label %if.then519
+
+if.then519:                                       ; preds = %if.end511
+  br i1 false, label %bsdf_alloc.exit2188, label %if.then.i2172
+
+if.then.i2172:                                    ; preds = %if.then519
+  br i1 false, label %closure_alloc.exit.i2184, label %if.end.i.i2181
+
+if.end.i.i2181:                                   ; preds = %if.then.i2172
+  br label %closure_alloc.exit.i2184
+
+closure_alloc.exit.i2184:                         ; preds = %if.end.i.i2181, %if.then.i2172
+  br i1 false, label %bsdf_alloc.exit2188, label %if.end.i2186
+
+if.end.i2186:                                     ; preds = %closure_alloc.exit.i2184
+  br label %bsdf_alloc.exit2188
+
+bsdf_alloc.exit2188:                              ; preds = %if.end.i2186, %closure_alloc.exit.i2184, %if.then519
+  br i1 false, label %common.ret, label %if.then534
+
+if.then534:                                       ; preds = %bsdf_alloc.exit2188
+  %.op7 = fmul nsz float undef, 0.000000e+00
+  %mul558 = select i1 %cmp440, float 0.000000e+00, float %1
+  %15 = tail call float @llvm.amdgcn.fmed3.f32(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00)
+  store float %mul558, float addrspace(1)* null, align 4
+  br label %if.end627.sink.split
+
+if.else568:                                       ; preds = %if.then413
+  br i1 undef, label %bsdf_alloc.exit2214, label %if.then.i2198
+
+if.then.i2198:                                    ; preds = %if.else568
+  br i1 undef, label %closure_alloc.exit.i2210, label %if.end.i.i2207
+
+if.end.i.i2207:                                   ; preds = %if.then.i2198
+  %arrayidx.i.i22028 = getelementptr inbounds %struct.ShaderData, %struct.ShaderData addrspace(1)* %sd, i64 0, i32 30, i64 undef
+  br label %closure_alloc.exit.i2210
+
+closure_alloc.exit.i2210:                         ; preds = %if.end.i.i2207, %if.then.i2198
+  %retval.0.i.i220899 = phi %struct.ShaderClosure addrspace(1)* [ %arrayidx.i.i2202, %if.end.i.i2207 ], [ null, %if.then.i2198 ]
+  br i1 false, label %bsdf_alloc.exit2214, label %if.end.i2212
+
+if.end.i2212:                                     ; preds = %closure_alloc.exit.i2210
+  br label %bsdf_alloc.exit2214
+
+bsdf_alloc.exit2214:                              ; preds = %if.end.i2212, %closure_alloc.exit.i2210, %if.else568
+  %retval.1.i22131010 = phi %struct.ShaderClosure addrspace(1)* [ %arrayidx.i.i2202, %if.end.i2212 ], [ null, %closure_alloc.exit.i2210 ], [ null, %if.else568 ]
+  %cmp57511 = icmp ne %struct.ShaderClosure addrspace(1)* %arrayidx.i.i2202, null
+  br i1 %cmp442, label %cond.true576, label %cond.end579
+
+cond.true576:                                     ; preds = %bsdf_alloc.exit2214
+  %num_closure_left.i221512 = getelementptr inbounds %struct.ShaderData, %struct.ShaderData addrspace(1)* %sd, i64 0, i32 25
+  %16 = load i32, i32 addrspace(1)* %num_closure_left.i2215, align 8
+  %cmp.i221613 = icmp slt i32 %0, 0
+  br i1 %cmp440, label %cond.end579, label %if.end.i2227
+
+if.end.i2227:                                     ; preds = %cond.true576
+  %sub5.i222114 = add nuw nsw i32 %0, 0
+  %17 = load i32, i32 addrspace(1)* null, align 4294967296
+  %idx.ext.i222315 = sext i32 %0 to i64
+  %add.ptr.i2224 = getelementptr inbounds %struct.ShaderData, %struct.ShaderData addrspace(1)* %sd, i64 0, i32 30, i64 %idx.ext.i2223
+  %idx.ext8.i22252724 = zext i32 %0 to i64
+  %add.ptr9.i2226 = getelementptr inbounds %struct.ShaderClosure, %struct.ShaderClosure addrspace(1)* %add.ptr.i2224, i64 %idx.ext8.i22252724
+  %phi.cast2731 = bitcast %struct.ShaderClosure addrspace(1)* %add.ptr9.i2226 to %struct.MicrofacetExtra addrspace(1)*
+  br label %cond.end579
+
+cond.end579:                                      ; preds = %if.end.i2227, %cond.true576, %bsdf_alloc.exit2214
+  %cond580 = phi %struct.MicrofacetExtra addrspace(1)* [ null, %bsdf_alloc.exit2214 ], [ %phi.cast2731, %if.end.i2227 ], [ null, %cond.true576 ]
+  %tobool583 = icmp ne %struct.MicrofacetExtra addrspace(1)* %cond580, null
+  %or.cond1308 = select i1 %cmp442, i1 %tobool583, i1 false
+  br i1 %or.cond1308, label %if.then584, label %common.ret
+
+if.then584:                                       ; preds = %cond.end579
+  store %struct.MicrofacetExtra addrspace(1)* null, %struct.MicrofacetExtra addrspace(1)* addrspace(1)* null, align 4294967296
+  br label %if.end627.sink.split
+
+if.end627.sink.split:                             ; preds = %if.then584, %if.then534
+  store i32 0, i32 addrspace(1)* null, align 4
+  br label %common.ret
+
+if.end.i.i2285:                                   ; preds = %cond.true20
+  store i32 0, i32 addrspace(1)* null, align 4294967296
+  br label %common.ret
+}
+
+define internal fastcc void @svm_eval_nodes(%struct.ShaderData addrspace(1)* %sd) {
+sw.bb10:
+; GCN-LABEL: {{^}}svm_eval_nodes:
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR:v[0-9]+]], s30,
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31,
+; GCN: s_swappc_b64 s[30:31]
+; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]],
+; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]],
+; GCN: s_waitcnt vmcnt(0)
+; GCN: s_setpc_b64 s[4:5]
+  call fastcc void @svm_node_closure_bsdf(%struct.ShaderData addrspace(1)* null, float* null, <4 x i32> zeroinitializer, i32* null, i32 undef, i8 undef, float undef, float undef, float undef, i1 undef, <4 x i32> undef, float undef, i32 undef, i1 undef, i1 undef, i1 undef, float undef, %struct.ShaderClosure addrspace(1)* undef, %struct.ShaderClosure addrspace(1)* undef, %struct.ShaderClosure addrspace(1)* undef, i1 undef, i32 addrspace(1)* undef, i32 undef, i1 undef, i32 undef, i64 undef, i32 undef)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_ocl_path_trace_shadow_blocked_dl() {
+kernel_set_buffer_pointers.exit:
+; GCN-LABEL: {{^}}kernel_ocl_path_trace_shadow_blocked_dl:
+; GCN: s_swappc_b64 s[30:31]
+; GCN: endpgm
+  tail call fastcc void @svm_eval_nodes(%struct.ShaderData addrspace(1)* null)
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.fabs.f32(float) #0
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.maxnum.f32(float, float) #0
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #3
+
+attributes #0 = { nofree nosync nounwind readnone speculatable willreturn }
+attributes #1 = { argmemonly nofree nosync nounwind willreturn }
+attributes #2 = { norecurse }
+attributes #3 = { nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
@@ -35,7 +35,7 @@
     ; GFX10: S_WAITCNT 0
     ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
-    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
     ; GFX10: S_BARRIER
     ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX10: S_WAITCNT 112
@@ -112,7 +112,7 @@
     ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX10: S_WAITCNT 0
-    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
     ; GFX10: S_BARRIER
     ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX10: S_WAITCNT 112
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx802  -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900  -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
 
 ; GCN-LABEL: barrier_vmcnt_global:
 ; GFX8:         flat_load_dword
@@ -42,7 +42,7 @@
   %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
   store i32 0, i32 addrspace(1)* %tmp5, align 4
   fence syncscope("singlethread") release
-  tail call void @llvm.amdgcn.s.barrier() #3
+  tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("singlethread") acquire
   %tmp6 = add nuw nsw i64 %tmp2, 4294967296
   %tmp7 = lshr exact i64 %tmp6, 32
@@ -116,7 +116,7 @@
   %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
   store i32 0, i32* %tmp5, align 4
   fence syncscope("singlethread") release
-  tail call void @llvm.amdgcn.s.barrier() #3
+  tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("singlethread") acquire
   %tmp6 = add nuw nsw i64 %tmp2, 4294967296
   %tmp7 = lshr exact i64 %tmp6, 32
diff --git a/llvm/test/CodeGen/LoongArch/1ri.mir b/llvm/test/CodeGen/LoongArch/1ri.mir
--- a/llvm/test/CodeGen/LoongArch/1ri.mir
+++ b/llvm/test/CodeGen/LoongArch/1ri.mir
@@ -80,17 +80,17 @@
 ---
 # CHECK-LABEL: test_BEQZ:
 # CHECK-ENC: 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0
-# CHECK-ASM: beqz	$a0, 23
+# CHECK-ASM: beqz	$a0, 92
 name: test_BEQZ
 body: |
   bb.0:
-    BEQZ $r4, 23
+    BEQZ $r4, 92
 ...
 ---
 # CHECK-LABEL: test_BNEZ:
 # CHECK-ENC: 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0
-# CHECK-ASM: bnez	$a0, 21
+# CHECK-ASM: bnez	$a0, 84
 name: test_BNEZ
 body: |
   bb.0:
-    BNEZ $r4, 21
+    BNEZ $r4, 84
diff --git a/llvm/test/CodeGen/LoongArch/2ri.mir b/llvm/test/CodeGen/LoongArch/2ri.mir
--- a/llvm/test/CodeGen/LoongArch/2ri.mir
+++ b/llvm/test/CodeGen/LoongArch/2ri.mir
@@ -280,74 +280,74 @@
 ---
 # CHECK-LABEL: test_LDPTR_W:
 # CHECK-ENC: 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: ldptr.w	$a0, $a1, 66
+# CHECK-ASM: ldptr.w	$a0, $a1, 264
 name: test_LDPTR_W
 body: |
   bb.0:
-    $r4 = LDPTR_W $r5, 66
+    $r4 = LDPTR_W $r5, 264
 ...
 ---
 # CHECK-LABEL: test_LDPTR_D:
 # CHECK-ENC: 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: ldptr.d	$a0, $a1, 56
+# CHECK-ASM: ldptr.d	$a0, $a1, 224
 name: test_LDPTR_D
 body: |
   bb.0:
-    $r4 = LDPTR_D $r5, 56
+    $r4 = LDPTR_D $r5, 224
 ...
 ---
 # CHECK-LABEL: test_STPTR_W:
 # CHECK-ENC: 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: stptr.w	$a0, $a1, 87
+# CHECK-ASM: stptr.w	$a0, $a1, 348
 name: test_STPTR_W
 body: |
   bb.0:
-    STPTR_W $r4, $r5, 87
+    STPTR_W $r4, $r5, 348
 ...
 ---
 # CHECK-LABEL: test_STPTR_D:
 # CHECK-ENC: 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: stptr.d	$a0, $a1, 145
+# CHECK-ASM: stptr.d	$a0, $a1, 580
 name: test_STPTR_D
 body: |
   bb.0:
-    STPTR_D $r4, $r5, 145
+    STPTR_D $r4, $r5, 580
 ...
 ---
 # CHECK-LABEL: test_LL_W:
 # CHECK-ENC: 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: ll.w	$a0, $a1, 243
+# CHECK-ASM: ll.w	$a0, $a1, 972
 name: test_LL_W
 body: |
   bb.0:
-    $r4 = LL_W $r5, 243
+    $r4 = LL_W $r5, 972
 ...
 ---
 # CHECK-LABEL: test_LL_D:
 # CHECK-ENC: 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: ll.d	$a0, $a1, 74
+# CHECK-ASM: ll.d	$a0, $a1, 296
 name: test_LL_D
 body: |
   bb.0:
-    $r4 = LL_D $r5, 74
+    $r4 = LL_D $r5, 296
 ...
 ---
 # CHECK-LABEL: test_SC_W:
 # CHECK-ENC: 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: sc.w	$a0, $a1, 96
+# CHECK-ASM: sc.w	$a0, $a1, 384
 name: test_SC_W
 body: |
   bb.0:
-    $r4 = SC_W $r4, $r5, 96
+    $r4 = SC_W $r4, $r5, 384
 ...
 ---
 # CHECK-LABEL: test_SC_D:
 # CHECK-ENC: 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: sc.d	$a0, $a1, 105
+# CHECK-ASM: sc.d	$a0, $a1, 420
 name: test_SC_D
 body: |
   bb.0:
-    $r4 = SC_D $r4, $r5, 105
+    $r4 = SC_D $r4, $r5, 420
 ...
 
 # -------------------------------------------------------------------------------------------------
@@ -371,62 +371,62 @@
 ---
 # CHECK-LABEL: test_JIRL:
 # CHECK-ENC: 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: jirl	$a0, $a1, 49
+# CHECK-ASM: jirl	$a0, $a1, 196
 name: test_JIRL
 body: |
   bb.0:
-    $r4 = JIRL $r5, 49
+    $r4 = JIRL $r5, 196
 ...
 ---
 # CHECK-LABEL: test_BEQ:
 # CHECK-ENC: 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1
-# CHECK-ASM: beq	$a0, $a1, 196
+# CHECK-ASM: beq	$a0, $a1, 784
 name: test_BEQ
 body: |
   bb.0:
-    BEQ $r4, $r5, 196
+    BEQ $r4, $r5, 784
 ...
 ---
 # CHECK-LABEL: test_BNE:
 # CHECK-ENC: 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1
-# CHECK-ASM: bne	$a0, $a1, 19
+# CHECK-ASM: bne	$a0, $a1, 76
 name: test_BNE
 body: |
   bb.0:
-    BNE $r4, $r5, 19
+    BNE $r4, $r5, 76
 ...
 ---
 # CHECK-LABEL: test_BLT:
 # CHECK-ENC: 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 1
-# CHECK-ASM: blt	$a0, $a1, 123
+# CHECK-ASM: blt	$a0, $a1, 492
 name: test_BLT
 body: |
   bb.0:
-    BLT $r4, $r5, 123
+    BLT $r4, $r5, 492
 ...
 ---
 # CHECK-LABEL: test_BGE:
 # CHECK-ENC: 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1
-# CHECK-ASM: bge	$a0, $a1, 12
+# CHECK-ASM: bge	$a0, $a1, 48
 name: test_BGE
 body: |
   bb.0:
-    BGE $r4, $r5, 12
+    BGE $r4, $r5, 48
 ...
 ---
 # CHECK-LABEL: test_BLTU:
 # CHECK-ENC: 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1
-# CHECK-ASM: bltu	$a0, $a1, 17
+# CHECK-ASM: bltu	$a0, $a1, 68
 name: test_BLTU
 body: |
   bb.0:
-    BLTU $r4, $r5, 17
+    BLTU $r4, $r5, 68
 ...
 ---
 # CHECK-LABEL: test_BGEU:
 # CHECK-ENC: 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1
-# CHECK-ASM: bgeu	$a0, $a1, 88
+# CHECK-ASM: bgeu	$a0, $a1, 352
 name: test_BGEU
 body: |
   bb.0:
-    BGEU $r4, $r5, 88
+    BGEU $r4, $r5, 352
diff --git a/llvm/test/CodeGen/LoongArch/3ri.mir b/llvm/test/CodeGen/LoongArch/3ri.mir
--- a/llvm/test/CodeGen/LoongArch/3ri.mir
+++ b/llvm/test/CodeGen/LoongArch/3ri.mir
@@ -16,29 +16,29 @@
 ---
 # CHECK-LABEL: test_ALSL_W:
 # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: alsl.w	$a0, $a1, $a2, 3
+# CHECK-ASM: alsl.w	$a0, $a1, $a2, 4
 name: test_ALSL_W
 body: |
   bb.0:
-    $r4 = ALSL_W $r5, $r6, 3
+    $r4 = ALSL_W $r5, $r6, 4
 ...
 ---
 # CHECK-LABEL: test_ALSL_WU:
 # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: alsl.wu	$a0, $a1, $a2, 1
+# CHECK-ASM: alsl.wu	$a0, $a1, $a2, 2
 name: test_ALSL_WU
 body: |
   bb.0:
-    $r4 = ALSL_WU $r5, $r6, 1
+    $r4 = ALSL_WU $r5, $r6, 2
 ...
 ---
 # CHECK-LABEL: test_ALSL_D:
 # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0
-# CHECK-ASM: alsl.d	$a0, $a1, $a2, 3
+# CHECK-ASM: alsl.d	$a0, $a1, $a2, 4
 name: test_ALSL_D
 body: |
   bb.0:
-    $r4 = ALSL_D $r5, $r6, 3
+    $r4 = ALSL_D $r5, $r6, 4
 ...
 ---
 # CHECK-LABEL: test_BYTEPICK_W:
diff --git a/llvm/test/CodeGen/LoongArch/misc.mir b/llvm/test/CodeGen/LoongArch/misc.mir
--- a/llvm/test/CodeGen/LoongArch/misc.mir
+++ b/llvm/test/CodeGen/LoongArch/misc.mir
@@ -62,20 +62,20 @@
 ---
 # CHECK-LABEL: test_B:
 # CHECK-ENC: 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
-# CHECK-ASM: b	20
+# CHECK-ASM: b	80
 name: test_B
 body: |
   bb.0:
-    B 20
+    B 80
 ...
 ---
 # CHECK-LABEL: test_BL:
 # CHECK-ENC: 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
-# CHECK-ASM: bl	34
+# CHECK-ASM: bl	136
 name: test_BL
 body: |
   bb.0:
-    BL 34
+    BL 136
 ...
 
 # --------------------------------------------------------------------------------------------------------
diff --git a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.ll b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.ll
--- a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.ll
+++ b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-math.ll
@@ -98,49 +98,104 @@
 
 declare float @llvm.ppc.fnmadds(float, float, float)
 
-define dso_local double @fnmsub_t0(double %d, double %d2, double %d3) {
-; CHECK-PWR8-LABEL: fnmsub_t0:
+define dso_local float @fnmsub_f32(float %f, float %f2, float %f3) {
+; CHECK-PWR8-LABEL: fnmsub_f32:
 ; CHECK-PWR8:       # %bb.0: # %entry
-; CHECK-PWR8-NEXT:    xsnmsubmdp 1, 2, 3
+; CHECK-PWR8-NEXT:    xsnmsubasp 3, 1, 2
+; CHECK-PWR8-NEXT:    fmr 1, 3
 ; CHECK-PWR8-NEXT:    blr
 ;
-; CHECK-NOVSX-LABEL: fnmsub_t0:
+; CHECK-NOVSX-LABEL: fnmsub_f32:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    fnmsubs 1, 1, 2, 3
+; CHECK-NOVSX-NEXT:    blr
+;
+; CHECK-PWR7-LABEL: fnmsub_f32:
+; CHECK-PWR7:       # %bb.0: # %entry
+; CHECK-PWR7-NEXT:    fnmsubs 1, 1, 2, 3
+; CHECK-PWR7-NEXT:    blr
+entry:
+  %0 = tail call float @llvm.ppc.fnmsub.f32(float %f, float %f2, float %f3)
+  ret float %0
+}
+
+declare float @llvm.ppc.fnmsub.f32(float, float, float)
+
+define dso_local double @fnmsub_f64(double %f, double %f2, double %f3) {
+; CHECK-PWR8-LABEL: fnmsub_f64:
+; CHECK-PWR8:       # %bb.0: # %entry
+; CHECK-PWR8-NEXT:    xsnmsubadp 3, 1, 2
+; CHECK-PWR8-NEXT:    fmr 1, 3
+; CHECK-PWR8-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: fnmsub_f64:
 ; CHECK-NOVSX:       # %bb.0: # %entry
 ; CHECK-NOVSX-NEXT:    fnmsub 1, 1, 2, 3
 ; CHECK-NOVSX-NEXT:    blr
 ;
-; CHECK-PWR7-LABEL: fnmsub_t0:
+; CHECK-PWR7-LABEL: fnmsub_f64:
 ; CHECK-PWR7:       # %bb.0: # %entry
-; CHECK-PWR7-NEXT:    xsnmsubmdp 1, 2, 3
+; CHECK-PWR7-NEXT:    xsnmsubadp 3, 1, 2
+; CHECK-PWR7-NEXT:    fmr 1, 3
 ; CHECK-PWR7-NEXT:    blr
 entry:
-  %0 = tail call double @llvm.ppc.fnmsub(double %d, double %d2, double %d3)
+  %0 = tail call double @llvm.ppc.fnmsub.f64(double %f, double %f2, double %f3)
   ret double %0
 }
 
-declare double @llvm.ppc.fnmsub(double, double, double)
+declare double @llvm.ppc.fnmsub.f64(double, double, double)
 
-define dso_local float @fnmsubs_t0(float %f, float %f2, float %f3) {
-; CHECK-PWR8-LABEL: fnmsubs_t0:
+define dso_local <4 x float> @fnmsub_v4f32(<4 x float> %f, <4 x float> %f2, <4 x float> %f3) {
+; CHECK-PWR8-LABEL: fnmsub_v4f32:
 ; CHECK-PWR8:       # %bb.0: # %entry
-; CHECK-PWR8-NEXT:    xsnmsubmsp 1, 2, 3
+; CHECK-PWR8-NEXT:    xvnmsubasp 36, 34, 35
+; CHECK-PWR8-NEXT:    vmr 2, 4
 ; CHECK-PWR8-NEXT:    blr
 ;
-; CHECK-NOVSX-LABEL: fnmsubs_t0:
+; CHECK-NOVSX-LABEL: fnmsub_v4f32:
 ; CHECK-NOVSX:       # %bb.0: # %entry
-; CHECK-NOVSX-NEXT:    fnmsubs 1, 1, 2, 3
+; CHECK-NOVSX-NEXT:    fnmsubs 1, 1, 5, 9
+; CHECK-NOVSX-NEXT:    fnmsubs 2, 2, 6, 10
+; CHECK-NOVSX-NEXT:    fnmsubs 3, 3, 7, 11
+; CHECK-NOVSX-NEXT:    fnmsubs 4, 4, 8, 12
 ; CHECK-NOVSX-NEXT:    blr
 ;
-; CHECK-PWR7-LABEL: fnmsubs_t0:
+; CHECK-PWR7-LABEL: fnmsub_v4f32:
 ; CHECK-PWR7:       # %bb.0: # %entry
-; CHECK-PWR7-NEXT:    fnmsubs 1, 1, 2, 3
+; CHECK-PWR7-NEXT:    xvnmsubasp 36, 34, 35
+; CHECK-PWR7-NEXT:    vmr 2, 4
 ; CHECK-PWR7-NEXT:    blr
 entry:
-  %0 = tail call float @llvm.ppc.fnmsubs(float %f, float %f2, float %f3)
-  ret float %0
+  %0 = tail call <4 x float> @llvm.ppc.fnmsub.v4f32(<4 x float> %f, <4 x float> %f2, <4 x float> %f3)
+  ret <4 x float> %0
+}
+
+declare <4 x float> @llvm.ppc.fnmsub.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+define dso_local <2 x double> @fnmsub_v2f64(<2 x double> %f, <2 x double> %f2, <2 x double> %f3) {
+; CHECK-PWR8-LABEL: fnmsub_v2f64:
+; CHECK-PWR8:       # %bb.0: # %entry
+; CHECK-PWR8-NEXT:    xvnmsubadp 36, 34, 35
+; CHECK-PWR8-NEXT:    vmr 2, 4
+; CHECK-PWR8-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: fnmsub_v2f64:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    fnmsub 1, 1, 3, 5
+; CHECK-NOVSX-NEXT:    fnmsub 2, 2, 4, 6
+; CHECK-NOVSX-NEXT:    blr
+;
+; CHECK-PWR7-LABEL: fnmsub_v2f64:
+; CHECK-PWR7:       # %bb.0: # %entry
+; CHECK-PWR7-NEXT:    xvnmsubadp 36, 34, 35
+; CHECK-PWR7-NEXT:    vmr 2, 4
+; CHECK-PWR7-NEXT:    blr
+entry:
+  %0 = tail call <2 x double> @llvm.ppc.fnmsub.v2f64(<2 x double> %f, <2 x double> %f2, <2 x double> %f3)
+  ret <2 x double> %0
 }
 
-declare float @llvm.ppc.fnmsubs(float, float, float)
+declare <2 x double> @llvm.ppc.fnmsub.v2f64(<2 x double>, <2 x double>, <2 x double>)
 
 define dso_local double @fre(double %d) {
 ; CHECK-PWR8-LABEL: fre:
diff --git a/llvm/test/CodeGen/PowerPC/vec-itofp.ll b/llvm/test/CodeGen/PowerPC/vec-itofp.ll
--- a/llvm/test/CodeGen/PowerPC/vec-itofp.ll
+++ b/llvm/test/CodeGen/PowerPC/vec-itofp.ll
@@ -307,34 +307,33 @@
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    lxv v2, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI3_0@toc@ha
-; CHECK-BE-NEXT:    xxlxor v4, v4, v4
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI3_0@toc@l
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI3_1@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI3_1@toc@l
-; CHECK-BE-NEXT:    vperm v3, v4, v2, v3
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
 ; CHECK-BE-NEXT:    vextsh2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs0, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI3_2@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI3_2@toc@l
-; CHECK-BE-NEXT:    vperm v3, v4, v2, v3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    vextsh2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs1, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI3_3@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI3_3@toc@l
 ; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs1, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    vextsh2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs2, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs2, 0(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
 ; CHECK-BE-NEXT:    vextsh2d v2, v2
 ; CHECK-BE-NEXT:    xvcvsxddp vs3, v2
-; CHECK-BE-NEXT:    stxv vs3, 32(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = load <8 x i16>, <8 x i16>* %SrcPtr, align 16
@@ -395,20 +394,19 @@
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    lxv v2, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI4_0@toc@ha
-; CHECK-BE-NEXT:    xxlxor v3, v3, v3
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI4_0@toc@l
-; CHECK-BE-NEXT:    lxv v4, 0(r4)
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI4_1@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI4_1@toc@l
-; CHECK-BE-NEXT:    vperm v3, v3, v2, v4
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
 ; CHECK-BE-NEXT:    vextsh2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs0, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    vextsh2d v2, v2
 ; CHECK-BE-NEXT:    xvcvsxddp vs1, v2
-; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = load <4 x i16>, <4 x i16>* %SrcPtr, align 16
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll
--- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll
@@ -459,20 +459,19 @@
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    mtvsrd v2, r4
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI5_0@toc@ha
-; CHECK-BE-NEXT:    xxlxor v3, v3, v3
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI5_0@toc@l
-; CHECK-BE-NEXT:    lxv v4, 0(r4)
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI5_1@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI5_1@toc@l
-; CHECK-BE-NEXT:    vperm v3, v3, v2, v4
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
 ; CHECK-BE-NEXT:    vextsh2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs0, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    vextsh2d v2, v2
 ; CHECK-BE-NEXT:    xvcvsxddp vs1, v2
-; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = bitcast i64 %a.coerce to <4 x i16>
@@ -564,34 +563,33 @@
 ; CHECK-BE-LABEL: test8elt_signed:
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_0@toc@ha
-; CHECK-BE-NEXT:    xxlxor v4, v4, v4
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_0@toc@l
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_1@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_1@toc@l
-; CHECK-BE-NEXT:    vperm v3, v4, v2, v3
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
 ; CHECK-BE-NEXT:    vextsh2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs0, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_2@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_2@toc@l
-; CHECK-BE-NEXT:    vperm v3, v4, v2, v3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    vextsh2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs1, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_3@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_3@toc@l
 ; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs1, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    vextsh2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs2, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs2, 0(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
 ; CHECK-BE-NEXT:    vextsh2d v2, v2
 ; CHECK-BE-NEXT:    xvcvsxddp vs3, v2
-; CHECK-BE-NEXT:    stxv vs3, 32(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = sitofp <8 x i16> %a to <8 x double>
@@ -730,52 +728,51 @@
 ; CHECK-BE-LABEL: test16elt_signed:
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    addis r5, r2, .LCPI7_0@toc@ha
-; CHECK-BE-NEXT:    lxv v4, 0(r4)
-; CHECK-BE-NEXT:    lxv v1, 16(r4)
-; CHECK-BE-NEXT:    xxlxor v5, v5, v5
-; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_2@toc@ha
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
 ; CHECK-BE-NEXT:    addi r5, r5, .LCPI7_0@toc@l
-; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_2@toc@l
-; CHECK-BE-NEXT:    lxv v2, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 0(r5)
 ; CHECK-BE-NEXT:    addis r5, r2, .LCPI7_1@toc@ha
 ; CHECK-BE-NEXT:    addi r5, r5, .LCPI7_1@toc@l
-; CHECK-BE-NEXT:    lxv v3, 0(r5)
-; CHECK-BE-NEXT:    vperm v0, v5, v4, v2
-; CHECK-BE-NEXT:    vperm v2, v5, v1, v2
-; CHECK-BE-NEXT:    vextsh2d v2, v2
-; CHECK-BE-NEXT:    vextsh2d v0, v0
-; CHECK-BE-NEXT:    xvcvsxddp vs2, v2
-; CHECK-BE-NEXT:    vperm v2, v5, v1, v3
-; CHECK-BE-NEXT:    xvcvsxddp vs0, v0
-; CHECK-BE-NEXT:    vperm v0, v5, v4, v3
+; CHECK-BE-NEXT:    lxv v5, 0(r5)
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI7_2@toc@ha
+; CHECK-BE-NEXT:    vperm v4, v2, v2, v3
+; CHECK-BE-NEXT:    addi r5, r5, .LCPI7_2@toc@l
+; CHECK-BE-NEXT:    vextsh2d v4, v4
+; CHECK-BE-NEXT:    lxv v0, 0(r5)
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI7_3@toc@ha
+; CHECK-BE-NEXT:    xvcvsxddp vs0, v4
+; CHECK-BE-NEXT:    vperm v4, v2, v2, v5
+; CHECK-BE-NEXT:    addi r5, r5, .LCPI7_3@toc@l
+; CHECK-BE-NEXT:    lxv v1, 0(r5)
+; CHECK-BE-NEXT:    vextsh2d v4, v4
+; CHECK-BE-NEXT:    xvcvsxddp vs1, v4
+; CHECK-BE-NEXT:    vperm v4, v2, v2, v0
+; CHECK-BE-NEXT:    vperm v2, v2, v2, v1
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    vextsh2d v4, v4
+; CHECK-BE-NEXT:    xvcvsxddp vs2, v4
+; CHECK-BE-NEXT:    lxv v4, 16(r4)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    vextsh2d v2, v2
-; CHECK-BE-NEXT:    vextsh2d v0, v0
 ; CHECK-BE-NEXT:    xvcvsxddp vs3, v2
-; CHECK-BE-NEXT:    lxv v2, 0(r4)
-; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_3@toc@ha
-; CHECK-BE-NEXT:    xvcvsxddp vs1, v0
-; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_3@toc@l
-; CHECK-BE-NEXT:    stxv vs2, 80(r3)
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
-; CHECK-BE-NEXT:    vperm v3, v4, v4, v2
-; CHECK-BE-NEXT:    vperm v2, v1, v1, v2
-; CHECK-BE-NEXT:    stxv vs3, 112(r3)
-; CHECK-BE-NEXT:    stxv vs1, 48(r3)
-; CHECK-BE-NEXT:    vextsh2d v3, v3
+; CHECK-BE-NEXT:    vperm v2, v4, v4, v3
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    vextsh2d v2, v2
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
+; CHECK-BE-NEXT:    xvcvsxddp vs4, v2
+; CHECK-BE-NEXT:    vperm v2, v4, v4, v5
+; CHECK-BE-NEXT:    vextsh2d v2, v2
+; CHECK-BE-NEXT:    xvcvsxddp vs5, v2
+; CHECK-BE-NEXT:    vperm v2, v4, v4, v0
+; CHECK-BE-NEXT:    stxv vs4, 64(r3)
 ; CHECK-BE-NEXT:    vextsh2d v2, v2
-; CHECK-BE-NEXT:    xvcvsxddp vs4, v3
-; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    xvcvsxddp vs6, v2
-; CHECK-BE-NEXT:    vperm v4, v4, v4, v3
-; CHECK-BE-NEXT:    vperm v2, v1, v1, v3
-; CHECK-BE-NEXT:    stxv vs6, 64(r3)
-; CHECK-BE-NEXT:    stxv vs4, 0(r3)
-; CHECK-BE-NEXT:    vextsh2d v4, v4
+; CHECK-BE-NEXT:    vperm v2, v4, v4, v1
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
 ; CHECK-BE-NEXT:    vextsh2d v2, v2
-; CHECK-BE-NEXT:    xvcvsxddp vs5, v4
 ; CHECK-BE-NEXT:    xvcvsxddp vs7, v2
-; CHECK-BE-NEXT:    stxv vs7, 96(r3)
-; CHECK-BE-NEXT:    stxv vs5, 32(r3)
+; CHECK-BE-NEXT:    stxv vs6, 96(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %a = load <16 x i16>, <16 x i16>* %0, align 32
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
--- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
@@ -404,20 +404,19 @@
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    mtvsrd v2, r4
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_0@toc@ha
-; CHECK-BE-NEXT:    xxlxor v3, v3, v3
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_0@toc@l
-; CHECK-BE-NEXT:    lxv v4, 0(r4)
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_1@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_1@toc@l
-; CHECK-BE-NEXT:    vperm v3, v3, v2, v4
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
 ; CHECK-BE-NEXT:    vextsb2w v3, v3
 ; CHECK-BE-NEXT:    xvcvsxwsp vs0, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    vextsb2w v2, v2
 ; CHECK-BE-NEXT:    xvcvsxwsp vs1, v2
-; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = bitcast i64 %a.coerce to <8 x i8>
@@ -503,34 +502,33 @@
 ; CHECK-BE-LABEL: test16elt_signed:
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_0@toc@ha
-; CHECK-BE-NEXT:    xxlxor v4, v4, v4
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_0@toc@l
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_1@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_1@toc@l
-; CHECK-BE-NEXT:    vperm v3, v4, v2, v3
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
 ; CHECK-BE-NEXT:    vextsb2w v3, v3
 ; CHECK-BE-NEXT:    xvcvsxwsp vs0, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_2@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_2@toc@l
-; CHECK-BE-NEXT:    vperm v3, v4, v2, v3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    vextsb2w v3, v3
 ; CHECK-BE-NEXT:    xvcvsxwsp vs1, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_3@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_3@toc@l
 ; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs1, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    vextsb2w v3, v3
 ; CHECK-BE-NEXT:    xvcvsxwsp vs2, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs2, 0(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
 ; CHECK-BE-NEXT:    vextsb2w v2, v2
 ; CHECK-BE-NEXT:    xvcvsxwsp vs3, v2
-; CHECK-BE-NEXT:    stxv vs3, 32(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = sitofp <16 x i8> %a to <16 x float>
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
--- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
@@ -492,20 +492,19 @@
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    mtvsrwz v2, r4
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI5_0@toc@ha
-; CHECK-BE-NEXT:    xxlxor v3, v3, v3
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI5_0@toc@l
-; CHECK-BE-NEXT:    lxv v4, 0(r4)
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI5_1@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI5_1@toc@l
-; CHECK-BE-NEXT:    vperm v3, v3, v2, v4
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
 ; CHECK-BE-NEXT:    vextsb2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs0, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    vextsb2d v2, v2
 ; CHECK-BE-NEXT:    xvcvsxddp vs1, v2
-; CHECK-BE-NEXT:    stxv vs1, 0(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = bitcast i32 %a.coerce to <4 x i8>
@@ -600,34 +599,33 @@
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    mtvsrd v2, r4
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_0@toc@ha
-; CHECK-BE-NEXT:    xxlxor v4, v4, v4
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_0@toc@l
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_1@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_1@toc@l
-; CHECK-BE-NEXT:    vperm v3, v4, v2, v3
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
 ; CHECK-BE-NEXT:    vextsb2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs0, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_2@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_2@toc@l
-; CHECK-BE-NEXT:    vperm v3, v4, v2, v3
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
 ; CHECK-BE-NEXT:    vextsb2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs1, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI6_3@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI6_3@toc@l
 ; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs1, 48(r3)
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
 ; CHECK-BE-NEXT:    vextsb2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs2, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs2, 0(r3)
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
 ; CHECK-BE-NEXT:    vextsb2d v2, v2
 ; CHECK-BE-NEXT:    xvcvsxddp vs3, v2
-; CHECK-BE-NEXT:    stxv vs3, 32(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = bitcast i64 %a.coerce to <8 x i8>
@@ -787,62 +785,61 @@
 ; CHECK-BE-LABEL: test16elt_signed:
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_0@toc@ha
-; CHECK-BE-NEXT:    xxlxor v3, v3, v3
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_0@toc@l
-; CHECK-BE-NEXT:    lxv v4, 0(r4)
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_1@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_1@toc@l
-; CHECK-BE-NEXT:    vperm v4, v3, v2, v4
-; CHECK-BE-NEXT:    vextsb2d v4, v4
-; CHECK-BE-NEXT:    xvcvsxddp vs0, v4
-; CHECK-BE-NEXT:    lxv v4, 0(r4)
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-BE-NEXT:    vextsb2d v3, v3
+; CHECK-BE-NEXT:    xvcvsxddp vs0, v3
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_2@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_2@toc@l
-; CHECK-BE-NEXT:    vperm v4, v3, v2, v4
-; CHECK-BE-NEXT:    stxv vs0, 16(r3)
-; CHECK-BE-NEXT:    vextsb2d v4, v4
-; CHECK-BE-NEXT:    xvcvsxddp vs1, v4
-; CHECK-BE-NEXT:    lxv v4, 0(r4)
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-BE-NEXT:    stxv vs0, 0(r3)
+; CHECK-BE-NEXT:    vextsb2d v3, v3
+; CHECK-BE-NEXT:    xvcvsxddp vs1, v3
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_3@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_3@toc@l
-; CHECK-BE-NEXT:    vperm v4, v3, v2, v4
-; CHECK-BE-NEXT:    stxv vs1, 48(r3)
-; CHECK-BE-NEXT:    vextsb2d v4, v4
-; CHECK-BE-NEXT:    xvcvsxddp vs2, v4
-; CHECK-BE-NEXT:    lxv v4, 0(r4)
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    vextsb2d v3, v3
+; CHECK-BE-NEXT:    xvcvsxddp vs2, v3
+; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_4@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_4@toc@l
-; CHECK-BE-NEXT:    vperm v3, v3, v2, v4
-; CHECK-BE-NEXT:    stxv vs2, 80(r3)
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-BE-NEXT:    stxv vs2, 32(r3)
 ; CHECK-BE-NEXT:    vextsb2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs3, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_5@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_5@toc@l
 ; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs3, 112(r3)
+; CHECK-BE-NEXT:    stxv vs3, 48(r3)
 ; CHECK-BE-NEXT:    vextsb2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs4, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_6@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_6@toc@l
 ; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs4, 0(r3)
+; CHECK-BE-NEXT:    stxv vs4, 64(r3)
 ; CHECK-BE-NEXT:    vextsb2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs5, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    addis r4, r2, .LCPI7_7@toc@ha
 ; CHECK-BE-NEXT:    addi r4, r4, .LCPI7_7@toc@l
 ; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs5, 32(r3)
+; CHECK-BE-NEXT:    stxv vs5, 80(r3)
 ; CHECK-BE-NEXT:    vextsb2d v3, v3
 ; CHECK-BE-NEXT:    xvcvsxddp vs6, v3
 ; CHECK-BE-NEXT:    lxv v3, 0(r4)
 ; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
-; CHECK-BE-NEXT:    stxv vs6, 64(r3)
+; CHECK-BE-NEXT:    stxv vs6, 96(r3)
 ; CHECK-BE-NEXT:    vextsb2d v2, v2
 ; CHECK-BE-NEXT:    xvcvsxddp vs7, v2
-; CHECK-BE-NEXT:    stxv vs7, 96(r3)
+; CHECK-BE-NEXT:    stxv vs7, 112(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = sitofp <16 x i8> %a to <16 x double>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgeu-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgeu-rv32.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgeu-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgeu-rv32.ll
@@ -2097,9 +2097,8 @@
 define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vi_nxv2i16_i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vi_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmset.m v8
-; CHECK-NEXT:    vmand.mm v0, v9, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmor.mm v0, v9, v0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i16.i16(
@@ -2112,6 +2111,21 @@
   ret <vscale x 2 x i1> %a
 }
 
+define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vi_nxv2i16_i16_same_mask_maskedoff(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
+; CHECK-LABEL: intrinsic_vmsgeu_mask_vi_nxv2i16_i16_same_mask_maskedoff:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i16.i16(
+    <vscale x 2 x i1> %0,
+    <vscale x 2 x i16> %1,
+    i16 0,
+    <vscale x 2 x i1> %0,
+    i32 %2)
+
+  ret <vscale x 2 x i1> %a
+}
+
 define <vscale x 4 x i1> @intrinsic_vmsgeu_vi_nxv4i16_i16(<vscale x 4 x i16> %0, i32 %1) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_vi_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgeu-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgeu-rv64.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgeu-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgeu-rv64.ll
@@ -2064,9 +2064,8 @@
 define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vi_nxv2i16_i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vi_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmset.m v8
-; CHECK-NEXT:    vmand.mm v0, v9, v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT:    vmor.mm v0, v9, v0
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i16.i16(
@@ -2094,6 +2093,21 @@
   ret <vscale x 4 x i1> %a
 }
 
+define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vi_nxv2i16_i16_same_mask_maskedoff(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vmsgeu_mask_vi_nxv2i16_i16_same_mask_maskedoff:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i16.i16(
+    <vscale x 2 x i1> %0,
+    <vscale x 2 x i16> %1,
+    i16 0,
+    <vscale x 2 x i1> %0,
+    i64 %2)
+
+  ret <vscale x 2 x i1> %a
+}
+
 define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vi_nxv4i16_i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vi_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -533,30 +533,27 @@
 define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) {
 ; CHECK-LABEL: vst3_v2i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    ldrb r2, [r0]
-; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    mov r4, sp
 ; CHECK-NEXT:    ldrb r3, [r0, #1]
 ; CHECK-NEXT:    ldrb.w r12, [r0, #2]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT:    ldrb.w lr, [r0, #3]
-; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    ldrb r5, [r0, #5]
-; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    vmov.16 q0[0], r2
 ; CHECK-NEXT:    ldrb r0, [r0, #4]
 ; CHECK-NEXT:    vmov.16 q0[1], r12
-; CHECK-NEXT:    mov r2, sp
 ; CHECK-NEXT:    vmov.16 q0[2], r0
 ; CHECK-NEXT:    add r0, sp, #8
 ; CHECK-NEXT:    vmov.16 q0[3], r3
 ; CHECK-NEXT:    vmov.16 q0[4], lr
 ; CHECK-NEXT:    vmov.16 q0[5], r5
-; CHECK-NEXT:    vmov.16 q0[6], r6
-; CHECK-NEXT:    vmov.16 q0[7], r6
-; CHECK-NEXT:    vstrb.16 q0, [r2]
+; CHECK-NEXT:    vstrb.16 q0, [r4]
 ; CHECK-NEXT:    vstrb.16 q0, [r0]
 ; CHECK-NEXT:    vldrh.u32 q0, [r0]
 ; CHECK-NEXT:    ldr r2, [sp]
@@ -564,7 +561,7 @@
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    strh r0, [r1, #4]
 ; CHECK-NEXT:    add sp, #16
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
   %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
diff --git a/llvm/test/CodeGen/VE/Packed/vec_load.ll b/llvm/test/CodeGen/VE/Packed/vec_load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vec_load.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %0, i32 immarg %1, <512 x i1> %2, <512 x float> %3) #0
+
+; Function Attrs: nounwind
+define fastcc <512 x float> @vec_mload_v512f32(<512 x float>* %P, <512 x i1> %M) {
+; CHECK-LABEL: vec_mload_v512f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vldu %v0, 8, %s0
+; CHECK-NEXT:    lea %s0, 4(, %s0)
+; CHECK-NEXT:    vldu %v1, 8, %s0
+; CHECK-NEXT:    vshf %v0, %v1, %v0, 8
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %P, i32 16, <512 x i1> %M, <512 x float> undef)
+  ret <512 x float> %r
+}
+
+; TODO: Enable the passthru (%PT) variant below once packed select legalization is implemented.
+; Function Attrs: nounwind
+; define fastcc <512 x float> @vec_mload_pt_v512f32(<512 x float>* %P, <512 x float> %PT, <512 x i1> %M) {
+;   %r = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %P, i32 16, <512 x i1> %M, <512 x float> %PT)
+;   ret <512 x float> %r
+; }
+
+declare <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %0, i32 immarg %1, <512 x i1> %2, <512 x i32> %3) #0
+
+; Function Attrs: nounwind
+define fastcc <512 x i32> @vec_mload_v512i32(<512 x i32>* %P, <512 x i1> %M) {
+; CHECK-LABEL: vec_mload_v512i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vldl.zx %v0, 8, %s0
+; CHECK-NEXT:    lea %s0, 4(, %s0)
+; CHECK-NEXT:    vldl.zx %v1, 8, %s0
+; CHECK-NEXT:    vshf %v0, %v1, %v0, 13
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %P, i32 16, <512 x i1> %M, <512 x i32> undef)
+  ret <512 x i32> %r
+}
+
+; TODO: Enable the passthru (%PT) variant below once packed select legalization is implemented.
+; Function Attrs: nounwind
+; define fastcc <512 x i32> @vec_mload_pt_v512i32(<512 x i32>* %P, <512 x i32> %PT, <512 x i1> %M) {
+;   %r = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %P, i32 16, <512 x i1> %M, <512 x i32> %PT)
+;   ret <512 x i32> %r
+; }
+
+attributes #0 = { argmemonly nounwind readonly willreturn }
diff --git a/llvm/test/CodeGen/VE/Packed/vec_store.ll b/llvm/test/CodeGen/VE/Packed/vec_store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vec_store.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare void @llvm.masked.store.v512f32.p0v512f32(<512 x float>, <512 x float>*, i32 immarg, <512 x i1>)
+
+define fastcc void @vec_mstore_v512f32(<512 x float>* %P, <512 x float> %V, <512 x i1> %M) {
+; CHECK-LABEL: vec_mstore_v512f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vstu %v0, 8, %s0
+; CHECK-NEXT:    vshf %v0, %v0, %v0, 4
+; CHECK-NEXT:    lea %s0, 4(, %s0)
+; CHECK-NEXT:    vstu %v0, 8, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  call void @llvm.masked.store.v512f32.p0v512f32(<512 x float> %V, <512 x float>* %P, i32 16, <512 x i1> %M)
+  ret void
+}
+
+
+declare void @llvm.masked.store.v512i32.p0v512i32(<512 x i32>, <512 x i32>*, i32 immarg, <512 x i1>)
+
+define fastcc void @vec_mstore_v512i32(<512 x i32>* %P, <512 x i32> %V, <512 x i1> %M) {
+; CHECK-LABEL: vec_mstore_v512i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 4(, %s0)
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vstl %v0, 8, %s1
+; CHECK-NEXT:    vshf %v0, %v0, %v0, 0
+; CHECK-NEXT:    vstl %v0, 8, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  call void @llvm.masked.store.v512i32.p0v512i32(<512 x i32> %V, <512 x i32>* %P, i32 16, <512 x i1> %M)
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -4232,63 +4232,65 @@
 
 declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8)
 
-define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
+define { <8 x i64>, <8 x i64>, <8 x i64> }@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpsrlq $4, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xd0,0x04]
-; X86-NEXT:    vpsrlq $5, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xd0,0x05]
-; X86-NEXT:    vpsrlq $6, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xd0,0x06]
-; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
-; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    vpsrlq $4, %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf1,0xe5,0x49,0x73,0xd0,0x04]
+; X86-NEXT:    vpsrlq $5, %zmm0, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x73,0xd0,0x05]
+; X86-NEXT:    vpsrlq $6, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xc9,0x73,0xd0,0x06]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
 ; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT:    vpsrlq $4, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xd0,0x04]
-; X64-NEXT:    vpsrlq $5, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xd0,0x05]
-; X64-NEXT:    vpsrlq $6, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xd0,0x06]
-; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
-; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    vpsrlq $4, %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf1,0xe5,0x49,0x73,0xd0,0x04]
+; X64-NEXT:    vpsrlq $5, %zmm0, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x73,0xd0,0x05]
+; X64-NEXT:    vpsrlq $6, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xc9,0x73,0xd0,0x06]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X64-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
+  %res0 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 6, <8 x i64> zeroinitializer, i8 %x3)
-  %res3 = add <8 x i64> %res, %res1
-  %res4 = add <8 x i64> %res3, %res2
-  ret <8 x i64> %res4
+  %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+  %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res3, <8 x i64> %res1, 1
+  %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res4, <8 x i64> %res2, 2
+  ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5
 }
 
-declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16)
+declare <16 x i32>@llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16)
 
-define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
+define { <16 x i32>, <16 x i32>, <16 x i32> }@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_psrl_di_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpsrld $4, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xd0,0x04]
-; X86-NEXT:    vpsrld $5, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0x72,0xd0,0x05]
-; X86-NEXT:    vpsrld $6, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xd0,0x06]
-; X86-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
-; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    vpsrld $4, %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf1,0x65,0x49,0x72,0xd0,0x04]
+; X86-NEXT:    vpsrld $5, %zmm0, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0x72,0xd0,0x05]
+; X86-NEXT:    vpsrld $6, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xc9,0x72,0xd0,0x06]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psrl_di_512:
 ; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT:    vpsrld $4, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xd0,0x04]
-; X64-NEXT:    vpsrld $5, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0x72,0xd0,0x05]
-; X64-NEXT:    vpsrld $6, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xd0,0x06]
-; X64-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
-; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    vpsrld $4, %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf1,0x65,0x49,0x72,0xd0,0x04]
+; X64-NEXT:    vpsrld $5, %zmm0, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0x72,0xd0,0x05]
+; X64-NEXT:    vpsrld $6, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xc9,0x72,0xd0,0x06]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X64-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 6, <16 x i32> zeroinitializer, i16 %x3)
-  %res3 = add <16 x i32> %res, %res1
-  %res4 = add <16 x i32> %res3, %res2
-  ret <16 x i32> %res4
+  %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+  %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res3, <16 x i32> %res1, 1
+  %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res4, <16 x i32> %res2, 2
+  ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5
 }
 
 declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16)
diff --git a/llvm/test/CodeGen/X86/avx512-rotate.ll b/llvm/test/CodeGen/X86/avx512-rotate.ll
--- a/llvm/test/CodeGen/X86/avx512-rotate.ll
+++ b/llvm/test/CodeGen/X86/avx512-rotate.ll
@@ -12,230 +12,238 @@
 
 ; Tests showing replacement of variable rotates with immediate splat versions.
 
-define <16 x i32> @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
+define { <16 x i32>, <16 x i32>, <16 x i32> } @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
 ; KNL-LABEL: test_splat_rol_v16i32:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vprold $5, %zmm0, %zmm1 {%k1}
-; KNL-NEXT:    vprold $6, %zmm0, %zmm2 {%k1} {z}
-; KNL-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vprold $7, %zmm0, %zmm0
-; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vprold $5, %zmm0, %zmm3 {%k1}
+; KNL-NEXT:    vprold $6, %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT:    vprold $7, %zmm0, %zmm2
+; KNL-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_splat_rol_v16i32:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vprold $5, %zmm0, %zmm1 {%k1}
-; SKX-NEXT:    vprold $6, %zmm0, %zmm2 {%k1} {z}
-; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vprold $7, %zmm0, %zmm0
-; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vprold $5, %zmm0, %zmm3 {%k1}
+; SKX-NEXT:    vprold $6, %zmm0, %zmm1 {%k1} {z}
+; SKX-NEXT:    vprold $7, %zmm0, %zmm2
+; SKX-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; SKX-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 %x2)
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>, <16 x i32> %x1, i16 -1)
-  %res3 = add <16 x i32> %res, %res1
-  %res4 = add <16 x i32> %res3, %res2
-  ret <16 x i32> %res4
+  %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+  %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res3, <16 x i32> %res1, 1
+  %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res4, <16 x i32> %res2, 2
+  ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5
 }
 
-define <8 x i64>@test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
+define { <8 x i64>, <8 x i64>, <8 x i64> } @test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
 ; KNL-LABEL: test_splat_rol_v8i64:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vprolq $5, %zmm0, %zmm1 {%k1}
-; KNL-NEXT:    vprolq $6, %zmm0, %zmm2 {%k1} {z}
-; KNL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vprolq $7, %zmm0, %zmm0
-; KNL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vprolq $5, %zmm0, %zmm3 {%k1}
+; KNL-NEXT:    vprolq $6, %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT:    vprolq $7, %zmm0, %zmm2
+; KNL-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_splat_rol_v8i64:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vprolq $5, %zmm0, %zmm1 {%k1}
-; SKX-NEXT:    vprolq $6, %zmm0, %zmm2 {%k1} {z}
-; SKX-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vprolq $7, %zmm0, %zmm0
-; SKX-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vprolq $5, %zmm0, %zmm3 {%k1}
+; SKX-NEXT:    vprolq $6, %zmm0, %zmm1 {%k1} {z}
+; SKX-NEXT:    vprolq $7, %zmm0, %zmm2
+; SKX-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; SKX-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 %x2)
+  %res0 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6>, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>, <8 x i64> %x1, i8 -1)
-  %res3 = add <8 x i64> %res, %res1
-  %res4 = add <8 x i64> %res3, %res2
-  ret <8 x i64> %res4
+  %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+  %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res3, <8 x i64> %res1, 1
+  %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res4, <8 x i64> %res2, 2
+  ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5
 }
 
-define <16 x i32> @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
+define { <16 x i32>, <16 x i32>, <16 x i32> } @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
 ; KNL-LABEL: test_splat_ror_v16i32:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vprord $5, %zmm0, %zmm1 {%k1}
-; KNL-NEXT:    vprord $6, %zmm0, %zmm2 {%k1} {z}
-; KNL-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vprord $7, %zmm0, %zmm0
-; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vprord $5, %zmm0, %zmm3 {%k1}
+; KNL-NEXT:    vprord $6, %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT:    vprord $7, %zmm0, %zmm2
+; KNL-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_splat_ror_v16i32:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vprord $5, %zmm0, %zmm1 {%k1}
-; SKX-NEXT:    vprord $6, %zmm0, %zmm2 {%k1} {z}
-; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vprord $7, %zmm0, %zmm0
-; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vprord $5, %zmm0, %zmm3 {%k1}
+; SKX-NEXT:    vprord $6, %zmm0, %zmm1 {%k1} {z}
+; SKX-NEXT:    vprord $7, %zmm0, %zmm2
+; SKX-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; SKX-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 %x2)
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>, <16 x i32> %x1, i16 -1)
-  %res3 = add <16 x i32> %res, %res1
-  %res4 = add <16 x i32> %res3, %res2
-  ret <16 x i32> %res4
+  %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+  %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res3, <16 x i32> %res1, 1
+  %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res4, <16 x i32> %res2, 2
+  ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5
 }
 
-define <8 x i64>@test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
+define { <8 x i64>, <8 x i64>, <8 x i64> } @test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
 ; KNL-LABEL: test_splat_ror_v8i64:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vprorq $5, %zmm0, %zmm1 {%k1}
-; KNL-NEXT:    vprorq $6, %zmm0, %zmm2 {%k1} {z}
-; KNL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vprorq $7, %zmm0, %zmm0
-; KNL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vprorq $5, %zmm0, %zmm3 {%k1}
+; KNL-NEXT:    vprorq $6, %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT:    vprorq $7, %zmm0, %zmm2
+; KNL-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_splat_ror_v8i64:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vprorq $5, %zmm0, %zmm1 {%k1}
-; SKX-NEXT:    vprorq $6, %zmm0, %zmm2 {%k1} {z}
-; SKX-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vprorq $7, %zmm0, %zmm0
-; SKX-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vprorq $5, %zmm0, %zmm3 {%k1}
+; SKX-NEXT:    vprorq $6, %zmm0, %zmm1 {%k1} {z}
+; SKX-NEXT:    vprorq $7, %zmm0, %zmm2
+; SKX-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; SKX-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 %x2)
+  %res0 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6>, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>, <8 x i64> %x1, i8 -1)
-  %res3 = add <8 x i64> %res, %res1
-  %res4 = add <8 x i64> %res3, %res2
-  ret <8 x i64> %res4
+  %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+  %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res3, <8 x i64> %res1, 1
+  %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res4, <8 x i64> %res2, 2
+  ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5
 }
 
 ; Tests showing replacement of out-of-bounds variable rotates with in-bounds immediate splat versions.
 
-define <16 x i32> @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
+define { <16 x i32>, <16 x i32>, <16 x i32> } @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
 ; KNL-LABEL: test_splat_bounds_rol_v16i32:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vprold $1, %zmm0, %zmm1 {%k1}
-; KNL-NEXT:    vprold $31, %zmm0, %zmm2 {%k1} {z}
-; KNL-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vprold $30, %zmm0, %zmm0
-; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vprold $1, %zmm0, %zmm3 {%k1}
+; KNL-NEXT:    vprold $31, %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT:    vprold $30, %zmm0, %zmm2
+; KNL-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_splat_bounds_rol_v16i32:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vprold $1, %zmm0, %zmm1 {%k1}
-; SKX-NEXT:    vprold $31, %zmm0, %zmm2 {%k1} {z}
-; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vprold $30, %zmm0, %zmm0
-; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vprold $1, %zmm0, %zmm3 {%k1}
+; SKX-NEXT:    vprold $31, %zmm0, %zmm1 {%k1} {z}
+; SKX-NEXT:    vprold $30, %zmm0, %zmm2
+; SKX-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; SKX-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>, <16 x i32> %x1, i16 %x2)
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534>, <16 x i32> %x1, i16 -1)
-  %res3 = add <16 x i32> %res, %res1
-  %res4 = add <16 x i32> %res3, %res2
-  ret <16 x i32> %res4
+  %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+  %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res3, <16 x i32> %res1, 1
+  %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res4, <16 x i32> %res2, 2
+  ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5
 }
 
-define <8 x i64>@test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
+define { <8 x i64>, <8 x i64>, <8 x i64> } @test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
 ; KNL-LABEL: test_splat_bounds_rol_v8i64:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vprolq $62, %zmm0, %zmm1 {%k1}
-; KNL-NEXT:    vprolq $1, %zmm0, %zmm2 {%k1} {z}
-; KNL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vprolq $63, %zmm0, %zmm0
-; KNL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vprolq $62, %zmm0, %zmm3 {%k1}
+; KNL-NEXT:    vprolq $1, %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT:    vprolq $63, %zmm0, %zmm2
+; KNL-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_splat_bounds_rol_v8i64:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vprolq $62, %zmm0, %zmm1 {%k1}
-; SKX-NEXT:    vprolq $1, %zmm0, %zmm2 {%k1} {z}
-; SKX-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vprolq $63, %zmm0, %zmm0
-; SKX-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vprolq $62, %zmm0, %zmm3 {%k1}
+; SKX-NEXT:    vprolq $1, %zmm0, %zmm1 {%k1} {z}
+; SKX-NEXT:    vprolq $63, %zmm0, %zmm2
+; SKX-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; SKX-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534>, <8 x i64> %x1, i8 %x2)
+  %res0 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534>, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65>, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> %x1, i8 -1)
-  %res3 = add <8 x i64> %res, %res1
-  %res4 = add <8 x i64> %res3, %res2
-  ret <8 x i64> %res4
+  %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+  %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res3, <8 x i64> %res1, 1
+  %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res4, <8 x i64> %res2, 2
+  ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5
 }
 
-define <16 x i32> @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
+define { <16 x i32>, <16 x i32>, <16 x i32> } @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
 ; KNL-LABEL: test_splat_bounds_ror_v16i32:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vprord $1, %zmm0, %zmm1 {%k1}
-; KNL-NEXT:    vprord $31, %zmm0, %zmm2 {%k1} {z}
-; KNL-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vprord $30, %zmm0, %zmm0
-; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vprord $1, %zmm0, %zmm3 {%k1}
+; KNL-NEXT:    vprord $31, %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT:    vprord $30, %zmm0, %zmm2
+; KNL-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_splat_bounds_ror_v16i32:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vprord $1, %zmm0, %zmm1 {%k1}
-; SKX-NEXT:    vprord $31, %zmm0, %zmm2 {%k1} {z}
-; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vprord $30, %zmm0, %zmm0
-; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vprord $1, %zmm0, %zmm3 {%k1}
+; SKX-NEXT:    vprord $31, %zmm0, %zmm1 {%k1} {z}
+; SKX-NEXT:    vprord $30, %zmm0, %zmm2
+; SKX-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; SKX-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>, <16 x i32> %x1, i16 %x2)
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534>, <16 x i32> %x1, i16 -1)
-  %res3 = add <16 x i32> %res, %res1
-  %res4 = add <16 x i32> %res3, %res2
-  ret <16 x i32> %res4
+  %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+  %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res3, <16 x i32> %res1, 1
+  %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> }  %res4, <16 x i32> %res2, 2
+  ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5
 }
 
-define <8 x i64>@test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
+define { <8 x i64>, <8 x i64>, <8 x i64> } @test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
 ; KNL-LABEL: test_splat_bounds_ror_v8i64:
 ; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vprorq $62, %zmm0, %zmm1 {%k1}
-; KNL-NEXT:    vprorq $1, %zmm0, %zmm2 {%k1} {z}
-; KNL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vprorq $63, %zmm0, %zmm0
-; KNL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vprorq $62, %zmm0, %zmm3 {%k1}
+; KNL-NEXT:    vprorq $1, %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT:    vprorq $63, %zmm0, %zmm2
+; KNL-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_splat_bounds_ror_v8i64:
 ; SKX:       # %bb.0:
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm3
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vprorq $62, %zmm0, %zmm1 {%k1}
-; SKX-NEXT:    vprorq $1, %zmm0, %zmm2 {%k1} {z}
-; SKX-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vprorq $63, %zmm0, %zmm0
-; SKX-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vprorq $62, %zmm0, %zmm3 {%k1}
+; SKX-NEXT:    vprorq $1, %zmm0, %zmm1 {%k1} {z}
+; SKX-NEXT:    vprorq $63, %zmm0, %zmm2
+; SKX-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; SKX-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534>, <8 x i64> %x1, i8 %x2)
+  %res0 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534>, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65>, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> %x1, i8 -1)
-  %res3 = add <8 x i64> %res, %res1
-  %res4 = add <8 x i64> %res3, %res2
-  ret <8 x i64> %res4
+  %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+  %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res3, <8 x i64> %res1, 1
+  %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> }  %res4, <8 x i64> %res2, 2
+  ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5
 }
 
 ; Constant folding
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -46,61 +46,60 @@
 
 declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
 
-define <64 x i8> @test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) nounwind {
+define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x6f,0xc1]
-; X86-NEXT:    vmovdqu8 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd1]
-; X86-NEXT:    vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2]
-; X86-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
+; X86-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x5c,0x24,0x04]
+; X86-NEXT:    vpblendmb %zmm3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0xcb]
+; X86-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd3]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastb %edi, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xcf]
+; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc8]
+; X64-NEXT:    vpbroadcastb %edi, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xc7]
 ; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
-; X64-NEXT:    vpbroadcastb %edi, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7a,0xc7]
+; X64-NEXT:    vpbroadcastb %edi, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7a,0xcf]
 ; X64-NEXT:    vpbroadcastb %edi, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7a,0xd7]
-; X64-NEXT:    vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2]
-; X64-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
     %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
     %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
-    %res3 = add <64 x i8> %res, %res1
-    %res4 = add <64 x i8> %res2, %res3
-    ret <64 x i8> %res4
-  }
+    %res3 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8>  %res, 0
+    %res4 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> }  %res3, <64 x i8> %res1, 1
+    %res5 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> }  %res4, <64 x i8> %res2, 2
+    ret { <64 x i8>, <64 x i8>, <64 x i8> } %res5
+}
 
 declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
-define <32 x i16> @test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) nounwind {
+
+define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x4c,0x24,0x02]
+; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x5c,0x24,0x02]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1]
-; X86-NEXT:    vmovdqu16 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xd1]
-; X86-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
-; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    vpblendmw %zmm3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x66,0xcb]
+; X86-NEXT:    vmovdqu16 %zmm3, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xd3]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastw %edi, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xcf]
+; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc8]
+; X64-NEXT:    vpbroadcastw %edi, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xc7]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpbroadcastw %edi, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7b,0xc7]
+; X64-NEXT:    vpbroadcastw %edi, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7b,0xcf]
 ; X64-NEXT:    vpbroadcastw %edi, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7b,0xd7]
-; X64-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
-; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
-; X64-NEXT:    retq # encoding: [0xc3]
-    %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
-    %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
-   %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
-    %res3 = add <32 x i16> %res, %res1
-   %res4 = add <32 x i16> %res2, %res3
-    ret <32 x i16> %res4
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
+  %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
+  %res3 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16>  %res, 0
+  %res4 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res3, <32 x i16> %res1, 1
+  %res5 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res4, <32 x i16> %res2, 2
+  ret { <32 x i16>, <32 x i16>, <32 x i16> } %res5
  }
 
 declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)
@@ -155,77 +154,78 @@
 
 declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)
 
-define <32 x i16> @test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) nounwind {
+define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_w_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0x00]
-; X86-NEXT:    vmovdqu16 (%ecx), %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0x09]
-; X86-NEXT:    vpaddw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; X86-NEXT:    vpblendmw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x66,0x08]
+; X86-NEXT:    vmovdqu16 (%ecx), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0x11]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_w_512:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqu64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07]
 ; X64-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
-; X64-NEXT:    vmovdqu16 (%rsi), %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0x06]
-; X64-NEXT:    vmovdqu16 (%rdi), %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0x0f]
-; X64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; X64-NEXT:    vpblendmw (%rsi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x66,0x0e]
+; X64-NEXT:    vmovdqu16 (%rdi), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0x17]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res0 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> %x1, i32 -1)
-  %res = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr2, <32 x i16> %res0, i32 %mask)
-  %res1 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> zeroinitializer, i32 %mask)
-  %res2 = add <32 x i16> %res, %res1
-  ret <32 x i16> %res2
+  %res = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> %x1, i32 -1)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr2, <32 x i16> %res, i32 %mask)
+  %res2 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> zeroinitializer, i32 %mask)
+  %res3 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16>  %res, 0
+  %res4 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res3, <32 x i16> %res1, 1
+  %res5 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res4, <32 x i16> %res2, 2
+  ret { <32 x i16>, <32 x i16>, <32 x i16> } %res5
 }
 
 declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64)
 
-define <64 x i8> @test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) nounwind {
+define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_b_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
-; X86-NEXT:    vmovdqu8 (%eax), %zmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x6f,0x00]
-; X86-NEXT:    vmovdqu8 (%ecx), %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x09]
-; X86-NEXT:    vpaddb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
+; X86-NEXT:    vpblendmb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0x08]
+; X86-NEXT:    vmovdqu8 (%ecx), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x11]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_b_512:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqu64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07]
 ; X64-NEXT:    kmovq %rdx, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xca]
-; X64-NEXT:    vmovdqu8 (%rsi), %zmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x6f,0x06]
-; X64-NEXT:    vmovdqu8 (%rdi), %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x0f]
-; X64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
+; X64-NEXT:    vpblendmb (%rsi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0x0e]
+; X64-NEXT:    vmovdqu8 (%rdi), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x17]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res0 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> %x1, i64 -1)
-  %res = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr2, <64 x i8> %res0, i64 %mask)
-  %res1 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> zeroinitializer, i64 %mask)
-  %res2 = add <64 x i8> %res, %res1
-  ret <64 x i8> %res2
+  %res = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> %x1, i64 -1)
+  %res1 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr2, <64 x i8> %res, i64 %mask)
+  %res2 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> zeroinitializer, i64 %mask)
+  %res3 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8>  %res, 0
+  %res4 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> }  %res3, <64 x i8> %res1, 1
+  %res5 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> }  %res4, <64 x i8> %res2, 2
+  ret { <64 x i8>, <64 x i8>, <64 x i8> } %res5
 }
 
 declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
 
-define <8 x i64> @test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) nounwind {
+define { <8 x i64>, <8 x i64> } @test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) nounwind {
 ; CHECK-LABEL: test_int_x86_avx512_psll_dq_512:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpslldq $8, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xf8,0x08]
-; CHECK-NEXT:    # zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
-; CHECK-NEXT:    vpslldq $4, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0xf8,0x04]
-; CHECK-NEXT:    # zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; CHECK-NEXT:    vpslldq $8, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x73,0xf8,0x08]
+; CHECK-NEXT:    # zmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
+; CHECK-NEXT:    vpslldq $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xf8,0x04]
+; CHECK-NEXT:    # zmm1 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
   %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
-  %res2 = add <8 x i64> %res, %res1
-  ret <8 x i64> %res2
+  %res2 = insertvalue { <8 x i64>, <8 x i64> } poison, <8 x i64>  %res, 0
+  %res3 = insertvalue { <8 x i64>, <8 x i64> }  %res2, <8 x i64> %res1, 1
+  ret { <8 x i64>, <8 x i64> } %res3
 }
 
 define <8 x i64> @test_int_x86_avx512_psll_load_dq_512(<8 x i64>* %p0) nounwind {
@@ -248,19 +248,20 @@
 
 declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
 
-define <8 x i64> @test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) nounwind {
+define { <8 x i64>, <8 x i64> } @test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) nounwind {
 ; CHECK-LABEL: test_int_x86_avx512_psrl_dq_512:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsrldq $8, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xd8,0x08]
-; CHECK-NEXT:    # zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vpsrldq $4, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0xd8,0x04]
-; CHECK-NEXT:    # zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; CHECK-NEXT:    vpsrldq $8, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x73,0xd8,0x08]
+; CHECK-NEXT:    # zmm2 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xd8,0x04]
+; CHECK-NEXT:    # zmm1 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
   %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
-  %res2 = add <8 x i64> %res, %res1
-  ret <8 x i64> %res2
+  %res2 = insertvalue { <8 x i64>, <8 x i64> } poison, <8 x i64>  %res, 0
+  %res3 = insertvalue { <8 x i64>, <8 x i64> }  %res2, <8 x i64> %res1, 1
+  ret { <8 x i64>, <8 x i64> } %res3
 }
 
 define <8 x i64> @test_int_x86_avx512_psrl_load_dq_512(<8 x i64>* %p0) nounwind {
@@ -1089,32 +1090,33 @@
 
 declare <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16>, i32, <32 x i16>, i32)
 
-define <32 x i16> @test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind {
+define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpsrlw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xd0,0x03]
-; X86-NEXT:    vpsrlw $4, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xd0,0x04]
-; X86-NEXT:    vpsrlw $5, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xd0,0x05]
-; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
-; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    vpsrlw $3, %zmm0, %zmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x49,0x71,0xd0,0x03]
+; X86-NEXT:    vpsrlw $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x71,0xd0,0x04]
+; X86-NEXT:    vpsrlw $5, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xc9,0x71,0xd0,0x05]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsrlw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xd0,0x03]
-; X64-NEXT:    vpsrlw $4, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xd0,0x04]
-; X64-NEXT:    vpsrlw $5, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xd0,0x05]
-; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
-; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    vpsrlw $3, %zmm0, %zmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x49,0x71,0xd0,0x03]
+; X64-NEXT:    vpsrlw $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x71,0xd0,0x04]
+; X64-NEXT:    vpsrlw $5, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xc9,0x71,0xd0,0x05]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 4, <32 x i16> %x2, i32 -1)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 5, <32 x i16> zeroinitializer, i32 %x3)
-  %res3 = add <32 x i16> %res, %res1
-  %res4 = add <32 x i16> %res3, %res2
-  ret <32 x i16> %res4
+  %res3 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16>  %res, 0
+  %res4 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res3, <32 x i16> %res1, 1
+  %res5 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res4, <32 x i16> %res2, 2
+  ret { <32 x i16>, <32 x i16>, <32 x i16> } %res5
 }
 
 declare <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
@@ -1164,32 +1166,33 @@
 
 declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i32, <32 x i16>, i32)
 
-define <32 x i16> @test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind {
+define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_psra_wi_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpsraw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xe0,0x03]
-; X86-NEXT:    vpsraw $4, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xc9,0x71,0xe0,0x04]
-; X86-NEXT:    vpsraw $5, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x71,0xe0,0x05]
-; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
-; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    vpsraw $3, %zmm0, %zmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x49,0x71,0xe0,0x03]
+; X86-NEXT:    vpsraw $4, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0xc9,0x71,0xe0,0x04]
+; X86-NEXT:    vpsraw $5, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xe0,0x05]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psra_wi_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsraw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xe0,0x03]
-; X64-NEXT:    vpsraw $4, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xc9,0x71,0xe0,0x04]
-; X64-NEXT:    vpsraw $5, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x71,0xe0,0x05]
-; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
-; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    vpsraw $3, %zmm0, %zmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x49,0x71,0xe0,0x03]
+; X64-NEXT:    vpsraw $4, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0xc9,0x71,0xe0,0x04]
+; X64-NEXT:    vpsraw $5, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xe0,0x05]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 4, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 5, <32 x i16> %x2, i32 -1)
-  %res3 = add <32 x i16> %res, %res1
-  %res4 = add <32 x i16> %res3, %res2
-  ret <32 x i16> %res4
+  %res3 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16>  %res, 0
+  %res4 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res3, <32 x i16> %res1, 1
+  %res5 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res4, <32 x i16> %res2, 2
+  ret { <32 x i16>, <32 x i16>, <32 x i16> } %res5
 }
 
 declare <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
@@ -1239,32 +1242,33 @@
 
 declare <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16>, i32, <32 x i16>, i32)
 
-define <32 x i16> @test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind {
+define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_psll_wi_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpsllw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xf0,0x03]
-; X86-NEXT:    vpsllw $4, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xc9,0x71,0xf0,0x04]
-; X86-NEXT:    vpsllw $5, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x71,0xf0,0x05]
-; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
-; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    vpsllw $3, %zmm0, %zmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x49,0x71,0xf0,0x03]
+; X86-NEXT:    vpsllw $4, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0xc9,0x71,0xf0,0x04]
+; X86-NEXT:    vpsllw $5, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xf0,0x05]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psll_wi_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsllw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xf0,0x03]
-; X64-NEXT:    vpsllw $4, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xc9,0x71,0xf0,0x04]
-; X64-NEXT:    vpsllw $5, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x71,0xf0,0x05]
-; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
-; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    vpsllw $3, %zmm0, %zmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x49,0x71,0xf0,0x03]
+; X64-NEXT:    vpsllw $4, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0xc9,0x71,0xf0,0x04]
+; X64-NEXT:    vpsllw $5, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xf0,0x05]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 4, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 5, <32 x i16> %x2, i32 -1)
-  %res3 = add <32 x i16> %res, %res1
-  %res4 = add <32 x i16> %res3, %res2
-  ret <32 x i16> %res4
+  %res3 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16>  %res, 0
+  %res4 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res3, <32 x i16> %res1, 1
+  %res5 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res4, <32 x i16> %res2, 2
+  ret { <32 x i16>, <32 x i16>, <32 x i16> } %res5
 }
 
 declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
@@ -3003,32 +3007,35 @@
 
 declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32)
 
-define <32 x i16> @test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) nounwind {
+define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xd1,0x02]
+; X86-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
 ; X86-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
-; X86-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xc1,0x04]
-; X86-NEXT:    vpaddw %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfd,0xc0]
-; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X86-NEXT:    vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xd1,0x02]
+; X64-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
 ; X64-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
-; X64-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xc1,0x04]
-; X64-NEXT:    vpaddw %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfd,0xc0]
-; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X64-NEXT:    vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 3, <32 x i16> zeroinitializer, i32 %x4)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 4, <32 x i16> %x3, i32 -1)
-  %res3 = add <32 x i16> %res, %res1
-  %res4 = add <32 x i16> %res3, %res2
-  ret <32 x i16> %res4
+  %res3 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16>  %res, 0
+  %res4 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res3, <32 x i16> %res1, 1
+  %res5 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res4, <32 x i16> %res2, 2
+  ret { <32 x i16>, <32 x i16>, <32 x i16> } %res5
 }
 
 define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind {
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1238,25 +1238,27 @@
 
 declare <32 x i16> @llvm.x86.avx512.dbpsadbw.512(<64 x i8>, <64 x i8>, i32)
 
-define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
+define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xd1,0x02]
+; X86-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
 ; X86-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
-; X86-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xc1,0x04]
-; X86-NEXT:    vpaddw %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfd,0xc0]
-; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X86-NEXT:    vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xd1,0x02]
+; X64-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
 ; X64-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
-; X64-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xc1,0x04]
-; X64-NEXT:    vpaddw %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfd,0xc0]
-; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X64-NEXT:    vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2)
   %2 = bitcast i32 %x4 to <32 x i1>
@@ -1265,24 +1267,26 @@
   %5 = bitcast i32 %x4 to <32 x i1>
   %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> zeroinitializer
   %7 = call <32 x i16> @llvm.x86.avx512.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 4)
-  %res3 = add <32 x i16> %3, %6
-  %res4 = add <32 x i16> %res3, %7
-  ret <32 x i16> %res4
+  %res1 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16> %3, 0
+  %res2 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res1, <32 x i16> %6, 1
+  %res3 = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> }  %res2, <32 x i16> %7, 2
+  ret { <32 x i16>, <32 x i16>, <32 x i16> } %res3
 }
 
 declare  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
 
-define  <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
+define { <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
 ; CHECK-LABEL: test_int_x86_avx512_mask_psadb_w_512:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsadbw %zmm1, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x7d,0x48,0xf6,0xc9]
-; CHECK-NEXT:    vpsadbw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xf6,0xc2]
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; CHECK-NEXT:    vpsadbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf6,0xd9]
+; CHECK-NEXT:    vpsadbw %zmm2, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x7d,0x48,0xf6,0xca]
+; CHECK-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
+  %res0 = call  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
   %res1 = call  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
-  %res2 = add  <8 x i64> %res, %res1
-  ret  <8 x i64> %res2
+  %res2 = insertvalue { <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+  %res3 = insertvalue { <8 x i64>, <8 x i64> }  %res2, <8 x i64> %res1, 1
+  ret { <8 x i64>, <8 x i64> } %res3
 }
 
 declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -4,127 +4,123 @@
 
 declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
 
-define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0x4c,0x24,0x04]
+; X86-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0x5c,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x6f,0xc1]
-; X86-NEXT:    vmovdqu8 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0xd1]
-; X86-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
-; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; X86-NEXT:    vpblendmb %xmm3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x66,0xcb]
+; X86-NEXT:    vmovdqu8 %xmm3, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0xd3]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastb %edi, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xcf]
+; X64-NEXT:    vmovdqa %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc8]
+; X64-NEXT:    vpbroadcastb %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xc7]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpbroadcastb %edi, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
+; X64-NEXT:    vpbroadcastb %edi, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xcf]
 ; X64-NEXT:    vpbroadcastb %edi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xd7]
-; X64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
-; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
   %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask)
   %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask)
-  %res3 = add <16 x i8> %res, %res1
-  %res4 = add <16 x i8> %res2, %res3
-  ret <16 x i8> %res4
+  %res3 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> %res0, 0
+  %res4 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res3, <16 x i8> %res1, 1
+  %res5 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res4, <16 x i8> %res2, 2
+  ret { <16 x i8>, <16 x i8>, <16 x i8> } %res5
 }
 
 
 declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
 
-define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x4c,0x24,0x04]
+; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x5c,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
-; X86-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xd1]
-; X86-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
-; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; X86-NEXT:    vpblendmw %xmm3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x66,0xcb]
+; X86-NEXT:    vmovdqu16 %xmm3, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xd3]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastw %edi, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xcf]
+; X64-NEXT:    vmovdqa %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc8]
+; X64-NEXT:    vpbroadcastw %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xc7]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpbroadcastw %edi, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
+; X64-NEXT:    vpbroadcastw %edi, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xcf]
 ; X64-NEXT:    vpbroadcastw %edi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xd7]
-; X64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
-; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask)
   %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask)
-  %res3 = add <8 x i16> %res, %res1
-  %res4 = add <8 x i16> %res2, %res3
-  ret <8 x i16> %res4
+  %res3 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } poison, <8 x i16> %res0, 0
+  %res4 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res3, <8 x i16> %res1, 1
+  %res5 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res4, <8 x i16> %res2, 2
+  ret { <8 x i16>, <8 x i16>, <8 x i16> } %res5
 }
 
 
 declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
 
-define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
+define { <32 x i8>, <32 x i8>, <32 x i8> } @test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0x4c,0x24,0x04]
+; X86-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0x5c,0x24,0x04]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x6f,0xc1]
-; X86-NEXT:    vmovdqu8 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0xd1]
-; X86-NEXT:    vpaddb %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2]
-; X86-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
+; X86-NEXT:    vpblendmb %ymm3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x66,0xcb]
+; X86-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0xd3]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastb %edi, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xcf]
+; X64-NEXT:    vmovdqa %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc8]
+; X64-NEXT:    vpbroadcastb %edi, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xc7]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpbroadcastb %edi, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
+; X64-NEXT:    vpbroadcastb %edi, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xcf]
 ; X64-NEXT:    vpbroadcastb %edi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xd7]
-; X64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2]
-; X64-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
+  %res0 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
   %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
   %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
-  %res3 = add <32 x i8> %res, %res1
-  %res4 = add <32 x i8> %res2, %res3
-  ret <32 x i8> %res4
+  %res3 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> %res0, 0
+  %res4 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> }  %res3, <32 x i8> %res1, 1
+  %res5 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> }  %res4, <32 x i8> %res2, 2
+  ret { <32 x i8>, <32 x i8>, <32 x i8> } %res5
 }
 
 
 
 declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
 
-define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
+define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x4c,0x24,0x04]
+; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x5c,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xc1]
-; X86-NEXT:    vmovdqu16 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xd1]
-; X86-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
-; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; X86-NEXT:    vpblendmw %ymm3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x66,0xcb]
+; X86-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xd3]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastw %edi, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xcf]
+; X64-NEXT:    vmovdqa %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc8]
+; X64-NEXT:    vpbroadcastw %edi, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xc7]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpbroadcastw %edi, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
+; X64-NEXT:    vpbroadcastw %edi, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xcf]
 ; X64-NEXT:    vpbroadcastw %edi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xd7]
-; X64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
-; X64-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
   %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
-  %res3 = add <16 x i16> %res, %res1
-  %res4 = add <16 x i16> %res2, %res3
-  ret <16 x i16> %res4
+  %res3 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } poison, <16 x i16> %res, 0
+  %res4 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res3, <16 x i16> %res1, 1
+  %res5 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res4, <16 x i16> %res2, 2
+  ret { <16 x i16>, <16 x i16>, <16 x i16> } %res5
 }
 
 declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
@@ -498,7 +494,7 @@
 
 declare <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8*, <8 x i16>, i8)
 
-define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) {
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_w_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
@@ -506,111 +502,111 @@
 ; X86-NEXT:    vmovdqu (%ecx), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
 ; X86-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
-; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0x00]
-; X86-NEXT:    vmovdqu16 (%ecx), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0x09]
-; X86-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; X86-NEXT:    vpblendmw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x66,0x08]
+; X86-NEXT:    vmovdqu16 (%ecx), %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0x11]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_w_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqu (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
 ; X64-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
-; X64-NEXT:    vmovdqu16 (%rsi), %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0x06]
-; X64-NEXT:    vmovdqu16 (%rdi), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0x0f]
-; X64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; X64-NEXT:    vpblendmw (%rsi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x66,0x0e]
+; X64-NEXT:    vmovdqu16 (%rdi), %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0x17]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> %x1, i8 -1)
-    %res = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr2, <8 x i16> %res0, i8 %mask)
-    %res1 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> zeroinitializer, i8 %mask)
-    %res2 = add <8 x i16> %res, %res1
-    ret <8 x i16> %res2
+    %res1 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr2, <8 x i16> %res0, i8 %mask)
+    %res2 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> zeroinitializer, i8 %mask)
+    %res3 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } poison, <8 x i16> %res0, 0
+    %res4 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res3, <8 x i16> %res1, 1
+    %res5 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res4, <8 x i16> %res2, 2
+    ret { <8 x i16>, <8 x i16>, <8 x i16> } %res5
 }
 
 declare <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8*, <16 x i16>, i16)
 
-define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16 x i16> %x1, i16 %mask) {
+define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16 x i16> %x1, i16 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_w_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    vmovdqu (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x01]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0x00]
-; X86-NEXT:    vmovdqu16 (%ecx), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x09]
-; X86-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
+; X86-NEXT:    vpblendmw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x66,0x08]
+; X86-NEXT:    vmovdqu16 (%ecx), %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x11]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_w_256:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqu (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
 ; X64-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
-; X64-NEXT:    vmovdqu16 (%rsi), %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0x06]
-; X64-NEXT:    vmovdqu16 (%rdi), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x0f]
-; X64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
+; X64-NEXT:    vpblendmw (%rsi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x66,0x0e]
+; X64-NEXT:    vmovdqu16 (%rdi), %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x17]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> %x1, i16 -1)
-    %res = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr2, <16 x i16> %res0, i16 %mask)
-    %res1 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> zeroinitializer, i16 %mask)
-    %res2 = add <16 x i16> %res, %res1
-    ret <16 x i16> %res2
+    %res1 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr2, <16 x i16> %res0, i16 %mask)
+    %res2 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> zeroinitializer, i16 %mask)
+    %res3 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } poison, <16 x i16> %res0, 0
+    %res4 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res3, <16 x i16> %res1, 1
+    %res5 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res4, <16 x i16> %res2, 2
+    ret { <16 x i16>, <16 x i16>, <16 x i16> } %res5
 }
 
 declare <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8*, <16 x i8>, i16)
 
-define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) {
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_b_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    vmovdqu (%ecx), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu8 (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x00]
-; X86-NEXT:    vmovdqu8 (%ecx), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x09]
-; X86-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
+; X86-NEXT:    vpblendmb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x66,0x08]
+; X86-NEXT:    vmovdqu8 (%ecx), %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x11]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_b_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqu (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
 ; X64-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
-; X64-NEXT:    vmovdqu8 (%rsi), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x06]
-; X64-NEXT:    vmovdqu8 (%rdi), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x0f]
-; X64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
+; X64-NEXT:    vpblendmb (%rsi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x66,0x0e]
+; X64-NEXT:    vmovdqu8 (%rdi), %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x17]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> %x1, i16 -1)
-    %res = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr2, <16 x i8> %res0, i16 %mask)
-    %res1 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> zeroinitializer, i16 %mask)
-    %res2 = add <16 x i8> %res, %res1
-    ret <16 x i8> %res2
+    %res1 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr2, <16 x i8> %res0, i16 %mask)
+    %res2 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> zeroinitializer, i16 %mask)
+    %res3 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> %res0, 0
+    %res4 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res3, <16 x i8> %res1, 1
+    %res5 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res4, <16 x i8> %res2, 2
+    ret { <16 x i8>, <16 x i8>, <16 x i8> } %res5
 }
 
 declare <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8*, <32 x i8>, i32)
 
-define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) {
+define { <32 x i8>, <32 x i8>, <32 x i8> } @test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_b_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    vmovdqu (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x01]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu8 (%eax), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x00]
-; X86-NEXT:    vmovdqu8 (%ecx), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x09]
-; X86-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
+; X86-NEXT:    vpblendmb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x66,0x08]
+; X86-NEXT:    vmovdqu8 (%ecx), %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x11]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_b_256:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqu (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
 ; X64-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
-; X64-NEXT:    vmovdqu8 (%rsi), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x06]
-; X64-NEXT:    vmovdqu8 (%rdi), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x0f]
-; X64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
+; X64-NEXT:    vpblendmb (%rsi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x66,0x0e]
+; X64-NEXT:    vmovdqu8 (%rdi), %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x17]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> %x1, i32 -1)
-    %res = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr2, <32 x i8> %res0, i32 %mask)
-    %res1 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> zeroinitializer, i32 %mask)
-    %res2 = add <32 x i8> %res, %res1
-    ret <32 x i8> %res2
+    %res1 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr2, <32 x i8> %res0, i32 %mask)
+    %res2 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> zeroinitializer, i32 %mask)
+    %res3 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> %res0, 0
+    %res4 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> }  %res3, <32 x i8> %res1, 1
+    %res5 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> }  %res4, <32 x i8> %res2, 2
+    ret { <32 x i8>, <32 x i8>, <32 x i8> } %res5
 }
 
 declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16)
@@ -3151,185 +3147,191 @@
 
 declare <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16>, i32, <8 x i16>, i8)
 
-define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_psrl_wi_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpsrlw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; X86-NEXT:    vpsrlw $4, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x04]
-; X86-NEXT:    vpsrlw $5, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x05]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; X86-NEXT:    vpsrlw $3, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x09,0x71,0xd0,0x03]
+; X86-NEXT:    vpsrlw $4, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xd0,0x04]
+; X86-NEXT:    vpsrlw $5, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0x89,0x71,0xd0,0x05]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psrl_wi_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsrlw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; X64-NEXT:    vpsrlw $4, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x04]
-; X64-NEXT:    vpsrlw $5, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x05]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; X64-NEXT:    vpsrlw $3, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x09,0x71,0xd0,0x03]
+; X64-NEXT:    vpsrlw $4, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xd0,0x04]
+; X64-NEXT:    vpsrlw $5, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0x89,0x71,0xd0,0x05]
+; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 4, <8 x i16> %x2, i8 -1)
   %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 5, <8 x i16> zeroinitializer, i8 %x3)
-  %res3 = add <8 x i16> %res, %res1
-  %res4 = add <8 x i16> %res2, %res3
-  ret <8 x i16> %res4
+  %res3 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } poison, <8 x i16> %res0, 0
+  %res4 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res3, <8 x i16> %res1, 1
+  %res5 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res4, <8 x i16> %res2, 2
+  ret { <8 x i16>, <8 x i16>, <8 x i16> } %res5
 }
 
 declare <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16>, i32, <16 x i16>, i16)
 
-define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
+define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_psrl_wi_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpsrlw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; X86-NEXT:    vpsrlw $4, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x04]
-; X86-NEXT:    vpsrlw $5, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x05]
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
-; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; X86-NEXT:    vpsrlw $3, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf1,0x65,0x29,0x71,0xd0,0x03]
+; X86-NEXT:    vpsrlw $4, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x71,0xd0,0x04]
+; X86-NEXT:    vpsrlw $5, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xd0,0x05]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psrl_wi_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsrlw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; X64-NEXT:    vpsrlw $4, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x04]
-; X64-NEXT:    vpsrlw $5, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x05]
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
-; X64-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; X64-NEXT:    vpsrlw $3, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf1,0x65,0x29,0x71,0xd0,0x03]
+; X64-NEXT:    vpsrlw $4, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x71,0xd0,0x04]
+; X64-NEXT:    vpsrlw $5, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xd0,0x05]
+; X64-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+  %res0 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 4, <16 x i16> %x2, i16 -1)
   %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 5, <16 x i16> zeroinitializer, i16 %x3)
-  %res3 = add <16 x i16> %res, %res1
-  %res4 = add <16 x i16> %res3, %res2
-  ret <16 x i16> %res4
+  %res3 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } poison, <16 x i16> %res0, 0
+  %res4 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res3, <16 x i16> %res1, 1
+  %res5 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res4, <16 x i16> %res2, 2
+  ret { <16 x i16>, <16 x i16>, <16 x i16> } %res5
 }
 
 declare <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16>, i32, <8 x i16>, i8)
 
-define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_psra_wi_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpsraw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03]
-; X86-NEXT:    vpsraw $4, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0x89,0x71,0xe0,0x04]
-; X86-NEXT:    vpsraw $5, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x05]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; X86-NEXT:    vpsraw $3, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x09,0x71,0xe0,0x03]
+; X86-NEXT:    vpsraw $4, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0x89,0x71,0xe0,0x04]
+; X86-NEXT:    vpsraw $5, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xe0,0x05]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psra_wi_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsraw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03]
-; X64-NEXT:    vpsraw $4, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0x89,0x71,0xe0,0x04]
-; X64-NEXT:    vpsraw $5, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x05]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; X64-NEXT:    vpsraw $3, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x09,0x71,0xe0,0x03]
+; X64-NEXT:    vpsraw $4, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0x89,0x71,0xe0,0x04]
+; X64-NEXT:    vpsraw $5, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xe0,0x05]
+; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 4, <8 x i16> zeroinitializer, i8 %x3)
   %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 5, <8 x i16> %x2, i8 -1)
-  %res3 = add <8 x i16> %res, %res1
-  %res4 = add <8 x i16> %res3, %res2
-  ret <8 x i16> %res4
+  %res3 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } poison, <8 x i16> %res0, 0
+  %res4 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res3, <8 x i16> %res1, 1
+  %res5 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res4, <8 x i16> %res2, 2
+  ret { <8 x i16>, <8 x i16>, <8 x i16> } %res5
 }
 
 declare <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16>, i32, <16 x i16>, i16)
 
-define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
+define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_psra_wi_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpsraw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03]
-; X86-NEXT:    vpsraw $4, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xe0,0x04]
-; X86-NEXT:    vpsraw $5, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x05]
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
-; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; X86-NEXT:    vpsraw $3, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf1,0x65,0x29,0x71,0xe0,0x03]
+; X86-NEXT:    vpsraw $4, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0xa9,0x71,0xe0,0x04]
+; X86-NEXT:    vpsraw $5, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xe0,0x05]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psra_wi_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsraw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03]
-; X64-NEXT:    vpsraw $4, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xe0,0x04]
-; X64-NEXT:    vpsraw $5, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x05]
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
-; X64-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; X64-NEXT:    vpsraw $3, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf1,0x65,0x29,0x71,0xe0,0x03]
+; X64-NEXT:    vpsraw $4, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0xa9,0x71,0xe0,0x04]
+; X64-NEXT:    vpsraw $5, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xe0,0x05]
+; X64-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 4, <16 x i16> zeroinitializer, i16 %x3)
   %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 5, <16 x i16> %x2, i16 -1)
-  %res3 = add <16 x i16> %res, %res1
-  %res4 = add <16 x i16> %res3, %res2
-  ret <16 x i16> %res4
+  %res3 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } poison, <16 x i16> %res, 0
+  %res4 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res3, <16 x i16> %res1, 1
+  %res5 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res4, <16 x i16> %res2, 2
+  ret { <16 x i16>, <16 x i16>, <16 x i16> } %res5
 }
 
 declare <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16>, i32, <8 x i16>, i8)
 
-define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_psll_wi_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpsllw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03]
-; X86-NEXT:    vpsllw $4, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0x89,0x71,0xf0,0x04]
-; X86-NEXT:    vpsllw $5, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x05]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; X86-NEXT:    vpsllw $3, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x09,0x71,0xf0,0x03]
+; X86-NEXT:    vpsllw $4, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0x89,0x71,0xf0,0x04]
+; X86-NEXT:    vpsllw $5, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xf0,0x05]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psll_wi_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsllw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03]
-; X64-NEXT:    vpsllw $4, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0x89,0x71,0xf0,0x04]
-; X64-NEXT:    vpsllw $5, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x05]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; X64-NEXT:    vpsllw $3, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x09,0x71,0xf0,0x03]
+; X64-NEXT:    vpsllw $4, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0x89,0x71,0xf0,0x04]
+; X64-NEXT:    vpsllw $5, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xf0,0x05]
+; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 4, <8 x i16> zeroinitializer, i8 %x3)
   %res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 5, <8 x i16> %x2, i8 -1)
-  %res3 = add <8 x i16> %res, %res1
-  %res4 = add <8 x i16> %res3, %res2
-  ret <8 x i16> %res4
+  %res3 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } poison, <8 x i16> %res0, 0
+  %res4 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res3, <8 x i16> %res1, 1
+  %res5 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res4, <8 x i16> %res2, 2
+  ret { <8 x i16>, <8 x i16>, <8 x i16> } %res5
 }
 
 declare <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16>, i32, <16 x i16>, i16)
 
-define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
+define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_psll_wi_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpsllw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03]
-; X86-NEXT:    vpsllw $4, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xf0,0x04]
-; X86-NEXT:    vpsllw $5, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x05]
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
-; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; X86-NEXT:    vpsllw $3, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf1,0x65,0x29,0x71,0xf0,0x03]
+; X86-NEXT:    vpsllw $4, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0xa9,0x71,0xf0,0x04]
+; X86-NEXT:    vpsllw $5, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xf0,0x05]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psll_wi_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
 ; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsllw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03]
-; X64-NEXT:    vpsllw $4, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xf0,0x04]
-; X64-NEXT:    vpsllw $5, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x05]
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
-; X64-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; X64-NEXT:    vpsllw $3, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf1,0x65,0x29,0x71,0xf0,0x03]
+; X64-NEXT:    vpsllw $4, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x75,0xa9,0x71,0xf0,0x04]
+; X64-NEXT:    vpsllw $5, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xf0,0x05]
+; X64-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 4, <16 x i16> zeroinitializer, i16 %x3)
   %res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 5, <16 x i16> %x2, i16 -1)
-  %res3 = add <16 x i16> %res, %res1
-  %res4 = add <16 x i16> %res3, %res2
-  ret <16 x i16> %res4
+  %res3 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } poison, <16 x i16> %res, 0
+  %res4 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res3, <16 x i16> %res1, 1
+  %res5 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res4, <16 x i16> %res2, 2
+  ret { <16 x i16>, <16 x i16>, <16 x i16> } %res5
 }
 
 declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
@@ -6925,63 +6927,69 @@
 
 declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8)
 
-define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
+; X86-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
 ; X86-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
-; X86-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xc1,0x04]
-; X86-NEXT:    vpaddw %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X86-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
+; X64-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
 ; X64-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
-; X64-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xc1,0x04]
-; X64-NEXT:    vpaddw %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X64-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 3, <8 x i16> zeroinitializer, i8 %x4)
   %res2 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 4, <8 x i16> %x3, i8 -1)
-  %res3 = add <8 x i16> %res, %res1
-  %res4 = add <8 x i16> %res2, %res3
-  ret <8 x i16> %res4
+  %res3 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } poison, <8 x i16> %res0, 0
+  %res4 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res3, <8 x i16> %res1, 1
+  %res5 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res4, <8 x i16> %res2, 2
+  ret { <8 x i16>, <8 x i16>, <8 x i16> } %res5
 }
 
 declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32, <16 x i16>, i16)
 
-define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
+define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
+; X86-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
 ; X86-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
-; X86-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xc1,0x04]
-; X86-NEXT:    vpaddw %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc0]
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X86-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
+; X64-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
 ; X64-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
-; X64-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xc1,0x04]
-; X64-NEXT:    vpaddw %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc0]
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X64-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 3, <16 x i16> zeroinitializer, i16 %x4)
   %res2 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 4, <16 x i16> %x3, i16 -1)
-  %res3 = add <16 x i16> %res, %res1
-  %res4 = add <16 x i16> %res3, %res2
-  ret <16 x i16> %res4
+  %res3 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } poison, <16 x i16> %res, 0
+  %res4 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res3, <16 x i16> %res1, 1
+  %res5 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res4, <16 x i16> %res2, 2
+  ret { <16 x i16>, <16 x i16>, <16 x i16> } %res5
 }
 
 define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1512,33 +1512,32 @@
 
 declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
 
-define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpmovwb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc2]
+; X86-NEXT:    vpmovwb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc3]
 ; X86-NEXT:    vpmovwb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
-; X86-NEXT:    vpmovwb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X86-NEXT:    vpmovwb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpmovwb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc2]
+; X64-NEXT:    vpmovwb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc3]
 ; X64-NEXT:    vpmovwb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
-; X64-NEXT:    vpmovwb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X64-NEXT:    vpmovwb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
+; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
-    %res3 = add <16 x i8> %res0, %res1
-    %res4 = add <16 x i8> %res3, %res2
-    ret <16 x i8> %res4
+    %res3 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> %res0,  0
+    %res4 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res3, <16 x i8> %res1, 1
+    %res5 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res4, <16 x i8> %res2, 2
+    ret { <16 x i8>, <16 x i8>, <16 x i8> } %res5
 }
 
 declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
@@ -1566,33 +1565,32 @@
 
 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
 
-define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpmovswb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc2]
+; X86-NEXT:    vpmovswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc3]
 ; X86-NEXT:    vpmovswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
-; X86-NEXT:    vpmovswb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X86-NEXT:    vpmovswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpmovswb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc2]
+; X64-NEXT:    vpmovswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc3]
 ; X64-NEXT:    vpmovswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
-; X64-NEXT:    vpmovswb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X64-NEXT:    vpmovswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
+; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
-    %res3 = add <16 x i8> %res0, %res1
-    %res4 = add <16 x i8> %res3, %res2
-    ret <16 x i8> %res4
+    %res3 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> %res0,  0
+    %res4 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res3, <16 x i8> %res1, 1
+    %res5 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res4, <16 x i8> %res2, 2
+    ret { <16 x i8>, <16 x i8>, <16 x i8> } %res5
 }
 
 declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
@@ -1620,33 +1618,32 @@
 
 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
 
-define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpmovuswb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc2]
+; X86-NEXT:    vpmovuswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc3]
 ; X86-NEXT:    vpmovuswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
-; X86-NEXT:    vpmovuswb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X86-NEXT:    vpmovuswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpmovuswb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc2]
+; X64-NEXT:    vpmovuswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc3]
 ; X64-NEXT:    vpmovuswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
-; X64-NEXT:    vpmovuswb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X64-NEXT:    vpmovuswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
+; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
-    %res3 = add <16 x i8> %res0, %res1
-    %res4 = add <16 x i8> %res3, %res2
-    ret <16 x i8> %res4
+    %res3 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> %res0,  0
+    %res4 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res3, <16 x i8> %res1, 1
+    %res5 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> }  %res4, <16 x i8> %res2, 2
+    ret { <16 x i8>, <16 x i8>, <16 x i8> } %res5
 }
 
 declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
@@ -1990,26 +1987,28 @@
 
 declare <8 x i16> @llvm.x86.avx512.dbpsadbw.128(<16 x i8>, <16 x i8>, i32)
 
-define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
+; X86-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
 ; X86-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
-; X86-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xc1,0x04]
-; X86-NEXT:    vpaddw %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X86-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
+; X64-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
 ; X64-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
-; X64-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xc1,0x04]
-; X64-NEXT:    vpaddw %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; X64-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <8 x i16> @llvm.x86.avx512.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2)
   %2 = bitcast i8 %x4 to <8 x i1>
@@ -2018,32 +2017,35 @@
   %5 = bitcast i8 %x4 to <8 x i1>
   %6 = select <8 x i1> %5, <8 x i16> %4, <8 x i16> zeroinitializer
   %7 = call <8 x i16> @llvm.x86.avx512.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 4)
-  %res3 = add <8 x i16> %3, %6
-  %res4 = add <8 x i16> %7, %res3
-  ret <8 x i16> %res4
+  %res3 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> } poison, <8 x i16> %3, 0
+  %res4 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res3, <8 x i16> %6, 1
+  %res5 = insertvalue { <8 x i16>, <8 x i16>, <8 x i16> }  %res4, <8 x i16> %7, 2
+  ret { <8 x i16>, <8 x i16>, <8 x i16> } %res5
 }
 
 declare <16 x i16> @llvm.x86.avx512.dbpsadbw.256(<32 x i8>, <32 x i8>, i32)
 
-define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
+define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
+; X86-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
 ; X86-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
-; X86-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xc1,0x04]
-; X86-NEXT:    vpaddw %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc0]
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X86-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
+; X64-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
 ; X64-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
-; X64-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xc1,0x04]
-; X64-NEXT:    vpaddw %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc0]
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; X64-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <16 x i16> @llvm.x86.avx512.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2)
   %2 = bitcast i16 %x4 to <16 x i1>
@@ -2052,9 +2054,10 @@
   %5 = bitcast i16 %x4 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> zeroinitializer
   %7 = call <16 x i16> @llvm.x86.avx512.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 4)
-  %res3 = add <16 x i16> %3, %6
-  %res4 = add <16 x i16> %res3, %7
-  ret <16 x i16> %res4
+  %res3 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> } poison, <16 x i16> %3, 0
+  %res4 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res3, <16 x i16> %6, 1
+  %res5 = insertvalue { <16 x i16>, <16 x i16>, <16 x i16> }  %res4, <16 x i16> %7, 2
+  ret { <16 x i16>, <16 x i16>, <16 x i16> } %res5
 }
 
 declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll
@@ -14,31 +14,32 @@
   ret <8 x i32> %res
 }
 
-define <8 x i32>@test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X86-NEXT:    vpdpbusd (%eax), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x18]
-; X86-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbusd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x00]
+; X86-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xda]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusd_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X64-NEXT:    vpdpbusd (%rdi), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x1f]
-; X64-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xc2]
-; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusd (%rdi), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x07]
+; X64-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xda]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, <8 x i32>* %x2p
-  %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+  %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8  %x3)
-  %res2 = add <8 x i32> %res, %res1
-  ret <8 x i32> %res2
+  %res2 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %res0, 0
+  %res3 = insertvalue { <8 x i32>, <8 x i32> }  %res2, <8 x i32> %res1, 1
+  ret { <8 x i32>, <8 x i32> } %res3
 }
 
 declare <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
@@ -53,31 +54,32 @@
   ret <4 x i32> %res
 }
 
-define <4 x i32>@test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X86-NEXT:    vpdpbusd (%eax), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x18]
-; X86-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbusd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x00]
+; X86-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xda]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusd_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X64-NEXT:    vpdpbusd (%rdi), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x1f]
-; X64-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xc2]
-; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusd (%rdi), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x07]
+; X64-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xda]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, <4 x i32>* %x2p
-  %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8  %x3)
-  %res2 = add <4 x i32> %res, %res1
-  ret <4 x i32> %res2
+  %res2 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %res0, 0
+  %res3 = insertvalue { <4 x i32>, <4 x i32> }  %res2, <4 x i32> %res1, 1
+  ret { <4 x i32>, <4 x i32> } %res3
 }
 
 declare <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
@@ -92,31 +94,32 @@
   ret <8 x i32> %res
 }
 
-define <8 x i32>@test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X86-NEXT:    vpdpbusds (%eax), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x18]
-; X86-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbusds (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x00]
+; X86-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xda]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusds_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X64-NEXT:    vpdpbusds (%rdi), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x1f]
-; X64-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xc2]
-; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusds (%rdi), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x07]
+; X64-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xda]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, <8 x i32>* %x2p
-  %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+  %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8  %x3)
-  %res2 = add <8 x i32> %res, %res1
-  ret <8 x i32> %res2
+  %res2 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %res0, 0
+  %res3 = insertvalue { <8 x i32>, <8 x i32> }  %res2, <8 x i32> %res1, 1
+  ret { <8 x i32>, <8 x i32> } %res3
 }
 
 declare <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
@@ -131,31 +134,32 @@
   ret <4 x i32> %res
 }
 
-define <4 x i32>@test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X86-NEXT:    vpdpbusds (%eax), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x18]
-; X86-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbusds (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x00]
+; X86-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xda]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusds_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X64-NEXT:    vpdpbusds (%rdi), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x1f]
-; X64-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xc2]
-; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusds (%rdi), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x07]
+; X64-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xda]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, <4 x i32>* %x2p
-  %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8  %x3)
-  %res2 = add <4 x i32> %res, %res1
-  ret <4 x i32> %res2
+  %res2 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %res0, 0
+  %res3 = insertvalue { <4 x i32>, <4 x i32> }  %res2, <4 x i32> %res1, 1
+  ret { <4 x i32>, <4 x i32> } %res3
 }
 
 declare <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
@@ -170,31 +174,32 @@
   ret <8 x i32> %res
 }
 
-define <8 x i32>@test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X86-NEXT:    vpdpwssd (%eax), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x18]
-; X86-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpwssd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x00]
+; X86-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X64-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x1f]
-; X64-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xc2]
-; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x07]
+; X64-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, <8 x i32>* %x2p
-  %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+  %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8  %x3)
-  %res2 = add <8 x i32> %res, %res1
-  ret <8 x i32> %res2
+  %res2 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %res0, 0
+  %res3 = insertvalue { <8 x i32>, <8 x i32> }  %res2, <8 x i32> %res1, 1
+  ret { <8 x i32>, <8 x i32> } %res3
 }
 
 declare <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
@@ -209,31 +214,32 @@
   ret <4 x i32> %res
 }
 
-define <4 x i32>@test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X86-NEXT:    vpdpwssd (%eax), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x18]
-; X86-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpwssd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x00]
+; X86-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X64-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x1f]
-; X64-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xc2]
-; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x07]
+; X64-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, <4 x i32>* %x2p
-  %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8  %x3)
-  %res2 = add <4 x i32> %res, %res1
-  ret <4 x i32> %res2
+  %res2 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %res0, 0
+  %res3 = insertvalue { <4 x i32>, <4 x i32> }  %res2, <4 x i32> %res1, 1
+  ret { <4 x i32>, <4 x i32> } %res3
 }
 
 
@@ -249,31 +255,32 @@
   ret <8 x i32> %res
 }
 
-define <8 x i32>@test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X86-NEXT:    vpdpwssds (%eax), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x18]
-; X86-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpwssds (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x00]
+; X86-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X64-NEXT:    vpdpwssds (%rdi), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x1f]
-; X64-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xc2]
-; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssds (%rdi), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x07]
+; X64-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, <8 x i32>* %x2p
-  %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+  %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8  %x3)
-  %res2 = add <8 x i32> %res, %res1
-  ret <8 x i32> %res2
+  %res2 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %res0, 0
+  %res3 = insertvalue { <8 x i32>, <8 x i32> }  %res2, <8 x i32> %res1, 1
+  ret { <8 x i32>, <8 x i32> } %res3
 }
 
 declare <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
@@ -288,29 +295,30 @@
   ret <4 x i32> %res
 }
 
-define <4 x i32>@test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X86-NEXT:    vpdpwssds (%eax), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x18]
-; X86-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpwssds (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x00]
+; X86-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X64-NEXT:    vpdpwssds (%rdi), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x1f]
-; X64-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xc2]
-; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssds (%rdi), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x07]
+; X64-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, <4 x i32>* %x2p
-  %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8  %x3)
-  %res2 = add <4 x i32> %res, %res1
-  ret <4 x i32> %res2
+  %res2 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %res0, 0
+  %res3 = insertvalue { <4 x i32>, <4 x i32> }  %res2, <4 x i32> %res1, 1
+  ret { <4 x i32>, <4 x i32> } %res3
 }
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -13,25 +13,25 @@
   ret <8 x i32> %1
 }
 
-define <8 x i32>@test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X86-NEXT:    vpdpbusd (%eax), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x18]
-; X86-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbusd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x00]
+; X86-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xda]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusd_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X64-NEXT:    vpdpbusd (%rdi), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x1f]
-; X64-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xc2]
-; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusd (%rdi), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x07]
+; X64-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xda]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, <8 x i32>* %x2p
   %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
@@ -40,8 +40,9 @@
   %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
-  %res3 = add <8 x i32> %3, %6
-  ret <8 x i32> %res3
+  %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
+  %res2 = insertvalue { <8 x i32>, <8 x i32> }  %res1, <8 x i32> %6, 1
+  ret { <8 x i32>, <8 x i32> } %res2
 }
 
 declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
@@ -55,25 +56,25 @@
   ret <4 x i32> %1
 }
 
-define <4 x i32>@test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X86-NEXT:    vpdpbusd (%eax), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x18]
-; X86-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbusd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x00]
+; X86-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xda]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusd_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X64-NEXT:    vpdpbusd (%rdi), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x1f]
-; X64-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xc2]
-; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusd (%rdi), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x07]
+; X64-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xda]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, <4 x i32>* %x2p
   %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
@@ -84,8 +85,9 @@
   %5 = bitcast i8 %x3 to <8 x i1>
   %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
-  %res3 = add <4 x i32> %3, %6
-  ret <4 x i32> %res3
+  %res1 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %3, 0
+  %res2 = insertvalue { <4 x i32>, <4 x i32> }  %res1, <4 x i32> %6, 1
+  ret { <4 x i32>, <4 x i32> } %res2
 }
 
 declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
@@ -99,25 +101,25 @@
   ret <8 x i32> %1
 }
 
-define <8 x i32>@test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X86-NEXT:    vpdpbusds (%eax), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x18]
-; X86-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbusds (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x00]
+; X86-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xda]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusds_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X64-NEXT:    vpdpbusds (%rdi), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x1f]
-; X64-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xc2]
-; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusds (%rdi), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x07]
+; X64-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xda]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, <8 x i32>* %x2p
   %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
@@ -126,8 +128,9 @@
   %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
-  %res3 = add <8 x i32> %3, %6
-  ret <8 x i32> %res3
+  %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
+  %res2 = insertvalue { <8 x i32>, <8 x i32> }  %res1, <8 x i32> %6, 1
+  ret { <8 x i32>, <8 x i32> } %res2
 }
 
 declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
@@ -141,25 +144,25 @@
   ret <4 x i32> %1
 }
 
-define <4 x i32>@test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X86-NEXT:    vpdpbusds (%eax), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x18]
-; X86-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbusds (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x00]
+; X86-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xda]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusds_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X64-NEXT:    vpdpbusds (%rdi), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x1f]
-; X64-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xc2]
-; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusds (%rdi), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x07]
+; X64-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xda]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, <4 x i32>* %x2p
   %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
@@ -170,8 +173,9 @@
   %5 = bitcast i8 %x3 to <8 x i1>
   %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
-  %res3 = add <4 x i32> %3, %6
-  ret <4 x i32> %res3
+  %res1 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %3, 0
+  %res2 = insertvalue { <4 x i32>, <4 x i32> }  %res1, <4 x i32> %6, 1
+  ret { <4 x i32>, <4 x i32> } %res2
 }
 
 declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
@@ -185,25 +189,25 @@
   ret <8 x i32> %1
 }
 
-define <8 x i32>@test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X86-NEXT:    vpdpwssd (%eax), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x18]
-; X86-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpwssd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x00]
+; X86-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X64-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x1f]
-; X64-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xc2]
-; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x07]
+; X64-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, <8 x i32>* %x2p
   %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
@@ -212,8 +216,9 @@
   %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
-  %res3 = add <8 x i32> %3, %6
-  ret <8 x i32> %res3
+  %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
+  %res2 = insertvalue { <8 x i32>, <8 x i32> }  %res1, <8 x i32> %6, 1
+  ret { <8 x i32>, <8 x i32> } %res2
 }
 
 declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
@@ -227,25 +232,25 @@
   ret <4 x i32> %1
 }
 
-define <4 x i32>@test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X86-NEXT:    vpdpwssd (%eax), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x18]
-; X86-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpwssd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x00]
+; X86-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X64-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x1f]
-; X64-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xc2]
-; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x07]
+; X64-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, <4 x i32>* %x2p
   %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
@@ -256,8 +261,9 @@
   %5 = bitcast i8 %x3 to <8 x i1>
   %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
-  %res3 = add <4 x i32> %3, %6
-  ret <4 x i32> %res3
+  %res1 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %3, 0
+  %res2 = insertvalue { <4 x i32>, <4 x i32> }  %res1, <4 x i32> %6, 1
+  ret { <4 x i32>, <4 x i32> } %res2
 }
 
 declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
@@ -271,25 +277,25 @@
   ret <8 x i32> %1
 }
 
-define <8 x i32>@test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X86-NEXT:    vpdpwssds (%eax), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x18]
-; X86-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpwssds (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x00]
+; X86-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; X64-NEXT:    vpdpwssds (%rdi), %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x1f]
-; X64-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xc2]
-; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssds (%rdi), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x07]
+; X64-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
+; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, <8 x i32>* %x2p
   %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
@@ -298,8 +304,9 @@
   %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
-  %res3 = add <8 x i32> %3, %6
-  ret <8 x i32> %res3
+  %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
+  %res2 = insertvalue { <8 x i32>, <8 x i32> }  %res1, <8 x i32> %6, 1
+  ret { <8 x i32>, <8 x i32> } %res2
 }
 
 declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
@@ -320,25 +327,25 @@
   ret <4 x i32> %1
 }
 
-define <4 x i32>@test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X86-NEXT:    vpdpwssds (%eax), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x18]
-; X86-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpwssds (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x00]
+; X86-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; X64-NEXT:    vpdpwssds (%rdi), %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x1f]
-; X64-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xc2]
-; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssds (%rdi), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x07]
+; X64-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
+; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, <4 x i32>* %x2p
   %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
@@ -349,6 +356,7 @@
   %5 = bitcast i8 %x3 to <8 x i1>
   %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
-  %res3 = add <4 x i32> %3, %6
-  ret <4 x i32> %res3
+  %res1 = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> %3, 0
+  %res2 = insertvalue { <4 x i32>, <4 x i32> }  %res1, <4 x i32> %6, 1
+  ret { <4 x i32>, <4 x i32> } %res2
 }
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
@@ -14,30 +14,31 @@
   ret <16 x i32> %res
 }
 
-define <16 x i32>@test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86-NEXT:    vpdpbusd (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x18]
-; X86-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xc2]
-; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    vpdpbusd (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x00]
+; X86-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xda]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X64-NEXT:    vpdpbusd (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x1f]
-; X64-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xc2]
-; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusd (%rdi), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x07]
+; X64-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xda]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16  %x3)
-  %res2 = add <16 x i32> %res, %res1
-  ret <16 x i32> %res2
+  %res2 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+  %res3 = insertvalue { <16 x i32>, <16 x i32> }  %res2, <16 x i32> %res1, 1
+  ret { <16 x i32>, <16 x i32> } %res3
 }
 
 declare <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
@@ -52,30 +53,31 @@
   ret <16 x i32> %res
 }
 
-define <16 x i32>@test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86-NEXT:    vpdpbusds (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x18]
-; X86-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xc2]
-; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    vpdpbusds (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x00]
+; X86-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xda]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X64-NEXT:    vpdpbusds (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x1f]
-; X64-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xc2]
-; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusds (%rdi), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x07]
+; X64-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xda]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16  %x3)
-  %res2 = add <16 x i32> %res, %res1
-  ret <16 x i32> %res2
+  %res2 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+  %res3 = insertvalue { <16 x i32>, <16 x i32> }  %res2, <16 x i32> %res1, 1
+  ret { <16 x i32>, <16 x i32> } %res3
 }
 
 declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
@@ -90,30 +92,31 @@
   ret <16 x i32> %res
 }
 
-define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86-NEXT:    vpdpwssd (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x18]
-; X86-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xc2]
-; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    vpdpwssd (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x00]
+; X86-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X64-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x1f]
-; X64-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xc2]
-; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x07]
+; X64-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16  %x3)
-  %res2 = add <16 x i32> %res, %res1
-  ret <16 x i32> %res2
+  %res2 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+  %res3 = insertvalue { <16 x i32>, <16 x i32> }  %res2, <16 x i32> %res1, 1
+  ret { <16 x i32>, <16 x i32> } %res3
 }
 
 declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
@@ -128,29 +131,30 @@
   ret <16 x i32> %res
 }
 
-define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86-NEXT:    vpdpwssds (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x18]
-; X86-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xc2]
-; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    vpdpwssds (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x00]
+; X86-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X64-NEXT:    vpdpwssds (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x1f]
-; X64-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xc2]
-; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssds (%rdi), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x07]
+; X64-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16  %x3)
-  %res2 = add <16 x i32> %res, %res1
-  ret <16 x i32> %res2
+  %res2 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+  %res3 = insertvalue { <16 x i32>, <16 x i32> }  %res2, <16 x i32> %res1, 1
+  ret { <16 x i32>, <16 x i32> } %res3
 }
 
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
@@ -4,7 +4,7 @@
 
 declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
 
-define <16 x i32>@test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ask_vpdpbusd_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x50,0xc2]
@@ -13,24 +13,24 @@
   ret <16 x i32> %1
 }
 
-define <16 x i32>@test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86-NEXT:    vpdpbusd (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x18]
-; X86-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xc2]
-; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    vpdpbusd (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x00]
+; X86-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xda]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X64-NEXT:    vpdpbusd (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x1f]
-; X64-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xc2]
-; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusd (%rdi), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x50,0x07]
+; X64-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xda]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
   %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
@@ -39,8 +39,9 @@
   %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
-  %res3 = add <16 x i32> %3, %6
-  ret <16 x i32> %res3
+  %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
+  %res2 = insertvalue { <16 x i32>, <16 x i32> }  %res1, <16 x i32> %6, 1
+  ret { <16 x i32>, <16 x i32> } %res2
 }
 
 declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
@@ -54,24 +55,24 @@
   ret <16 x i32> %1
 }
 
-define <16 x i32>@test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86-NEXT:    vpdpbusds (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x18]
-; X86-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xc2]
-; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    vpdpbusds (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x00]
+; X86-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xda]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X64-NEXT:    vpdpbusds (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x1f]
-; X64-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xc2]
-; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpbusds (%rdi), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x51,0x07]
+; X64-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xda]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
   %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
@@ -80,8 +81,9 @@
   %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
-  %res3 = add <16 x i32> %3, %6
-  ret <16 x i32> %res3
+  %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
+  %res2 = insertvalue { <16 x i32>, <16 x i32> }  %res1, <16 x i32> %6, 1
+  ret { <16 x i32>, <16 x i32> } %res2
 }
 
 declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
@@ -95,24 +97,24 @@
   ret <16 x i32> %1
 }
 
-define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86-NEXT:    vpdpwssd (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x18]
-; X86-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xc2]
-; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    vpdpwssd (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x00]
+; X86-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X64-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x1f]
-; X64-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xc2]
-; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0x07]
+; X64-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
   %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
@@ -121,8 +123,9 @@
   %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
-  %res3 = add <16 x i32> %3, %6
-  ret <16 x i32> %res3
+  %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
+  %res2 = insertvalue { <16 x i32>, <16 x i32> }  %res1, <16 x i32> %6, 1
+  ret { <16 x i32>, <16 x i32> } %res2
 }
 
 declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
@@ -136,24 +139,24 @@
   ret <16 x i32> %1
 }
 
-define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86-NEXT:    vpdpwssds (%eax), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x18]
-; X86-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xc2]
-; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    vpdpwssds (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x00]
+; X86-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X64-NEXT:    vpdpwssds (%rdi), %zmm1, %zmm3 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x1f]
-; X64-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xc2]
-; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpdpwssds (%rdi), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0x07]
+; X64-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda]
+; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
   %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
@@ -162,6 +165,7 @@
   %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
-  %res3 = add <16 x i32> %3, %6
-  ret <16 x i32> %res3
+  %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
+  %res2 = insertvalue { <16 x i32>, <16 x i32> }  %res1, <16 x i32> %6, 1
+  ret { <16 x i32>, <16 x i32> } %res2
 }
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.5a-specrestrict.txt b/llvm/test/MC/Disassembler/AArch64/armv8.5a-specrestrict.txt
--- a/llvm/test/MC/Disassembler/AArch64/armv8.5a-specrestrict.txt
+++ b/llvm/test/MC/Disassembler/AArch64/armv8.5a-specrestrict.txt
@@ -1,6 +1,6 @@
 # RUN: llvm-mc -triple=aarch64 -mattr=+specrestrict -disassemble < %s | FileCheck %s
 # RUN: llvm-mc -triple=aarch64 -mattr=+v8.5a        -disassemble < %s | FileCheck %s
-# RUN: llvm-mc -triple=aarch64 -mattr=+v8r -disassemble < %s | FileCheck %s --check-prefix=NOSPECID
+# RUN: llvm-mc -triple=aarch64 -mattr=+v8r -disassemble < %s | FileCheck %s
 # RUN: llvm-mc -triple=aarch64 -mattr=-specrestrict -disassemble < %s | FileCheck %s --check-prefix=NOSPECID
 
 [0x81 0x03 0x38 0xd5]
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll
@@ -28,7 +28,7 @@
 ; IS__CGSCC_NPM-NEXT:    [[X_PRIV:%.*]] = alloca i32, align 4
 ; IS__CGSCC_NPM-NEXT:    store i32 [[TMP0]], i32* [[X_PRIV]], align 4
 ; IS__CGSCC_NPM-NEXT:    [[TMP2:%.*]] = load i32, i32* [[X_PRIV]], align 4
-; IS__CGSCC_NPM-NEXT:    ret i32 [[TMP0]]
+; IS__CGSCC_NPM-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
   %tmp2 = load i32, i32* %x, align 4
@@ -76,7 +76,7 @@
 ;.
 ; IS__CGSCC_OPM: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind readonly willreturn }
 ; IS__CGSCC_OPM: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind readnone willreturn }
-; IS__CGSCC_OPM: attributes #[[ATTR2]] = { nounwind readonly willreturn }
+; IS__CGSCC_OPM: attributes #[[ATTR2]] = { nosync nounwind readonly willreturn }
 ;.
 ; IS__CGSCC_NPM: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind readnone willreturn }
 ;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll
@@ -104,25 +104,15 @@
 ; IS________OPM-NEXT:    store <4 x i64> [[TMP]], <4 x i64>* [[ARG]], align 32
 ; IS________OPM-NEXT:    ret void
 ;
-; IS__TUNIT_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@promote_avx2
-; IS__TUNIT_NPM-SAME: (<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; IS__TUNIT_NPM-NEXT:  bb:
-; IS__TUNIT_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <4 x i64>, align 32
-; IS__TUNIT_NPM-NEXT:    store <4 x i64> [[TMP0]], <4 x i64>* [[ARG1_PRIV]], align 32
-; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = load <4 x i64>, <4 x i64>* [[ARG1_PRIV]], align 32
-; IS__TUNIT_NPM-NEXT:    store <4 x i64> [[TMP]], <4 x i64>* [[ARG]], align 32
-; IS__TUNIT_NPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@promote_avx2
-; IS__CGSCC_NPM-SAME: (<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; IS__CGSCC_NPM-NEXT:  bb:
-; IS__CGSCC_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <4 x i64>, align 32
-; IS__CGSCC_NPM-NEXT:    store <4 x i64> [[TMP0]], <4 x i64>* [[ARG1_PRIV]], align 32
-; IS__CGSCC_NPM-NEXT:    [[TMP:%.*]] = load <4 x i64>, <4 x i64>* [[ARG1_PRIV]], align 32
-; IS__CGSCC_NPM-NEXT:    store <4 x i64> [[TMP0]], <4 x i64>* [[ARG]], align 32
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS________NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
+; IS________NPM-LABEL: define {{[^@]+}}@promote_avx2
+; IS________NPM-SAME: (<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; IS________NPM-NEXT:  bb:
+; IS________NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <4 x i64>, align 32
+; IS________NPM-NEXT:    store <4 x i64> [[TMP0]], <4 x i64>* [[ARG1_PRIV]], align 32
+; IS________NPM-NEXT:    [[TMP:%.*]] = load <4 x i64>, <4 x i64>* [[ARG1_PRIV]], align 32
+; IS________NPM-NEXT:    store <4 x i64> [[TMP]], <4 x i64>* [[ARG]], align 32
+; IS________NPM-NEXT:    ret void
 ;
 bb:
   %tmp = load <4 x i64>, <4 x i64>* %arg1
@@ -181,8 +171,7 @@
 ; IS__CGSCC_NPM-NEXT:    call void @llvm.memset.p0i8.i64(i8* nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR3]]
 ; IS__CGSCC_NPM-NEXT:    [[TMP0:%.*]] = load <4 x i64>, <4 x i64>* [[TMP]], align 32
 ; IS__CGSCC_NPM-NEXT:    call fastcc void @promote_avx2(<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64> [[TMP0]]) #[[ATTR4]]
-; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 32
-; IS__CGSCC_NPM-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* [[ARG]], align 2
+; IS__CGSCC_NPM-NEXT:    store <4 x i64> [[TMP0]], <4 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
@@ -19,25 +19,15 @@
 ; IS________OPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; IS________OPM-NEXT:    ret void
 ;
-; IS__TUNIT_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; IS__TUNIT_NPM-NEXT:  bb:
-; IS__TUNIT_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
-; IS__TUNIT_NPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
-; IS__CGSCC_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; IS__CGSCC_NPM-NEXT:  bb:
-; IS__CGSCC_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 64
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS________NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
+; IS________NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
+; IS________NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; IS________NPM-NEXT:  bb:
+; IS________NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
+; IS________NPM-NEXT:    ret void
 ;
 bb:
   %tmp = load <8 x i64>, <8 x i64>* %arg1
@@ -97,8 +87,7 @@
 ; IS__CGSCC_NPM-NEXT:    call void @llvm.memset.p0i8.i64(i8* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR6:[0-9]+]]
 ; IS__CGSCC_NPM-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
 ; IS__CGSCC_NPM-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR7:[0-9]+]]
-; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
+; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 bb:
@@ -123,25 +112,15 @@
 ; IS________OPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; IS________OPM-NEXT:    ret void
 ;
-; IS__TUNIT_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
-; IS__TUNIT_NPM-NEXT:  bb:
-; IS__TUNIT_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
-; IS__TUNIT_NPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
-; IS__CGSCC_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
-; IS__CGSCC_NPM-NEXT:  bb:
-; IS__CGSCC_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 64
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS________NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
+; IS________NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
+; IS________NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
+; IS________NPM-NEXT:  bb:
+; IS________NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
+; IS________NPM-NEXT:    ret void
 ;
 bb:
   %tmp = load <8 x i64>, <8 x i64>* %arg1
@@ -201,8 +180,7 @@
 ; IS__CGSCC_NPM-NEXT:    call void @llvm.memset.p0i8.i64(i8* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR6]]
 ; IS__CGSCC_NPM-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
 ; IS__CGSCC_NPM-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR7]]
-; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
+; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 bb:
@@ -227,25 +205,15 @@
 ; IS________OPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; IS________OPM-NEXT:    ret void
 ;
-; IS__TUNIT_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] {
-; IS__TUNIT_NPM-NEXT:  bb:
-; IS__TUNIT_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
-; IS__TUNIT_NPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
-; IS__CGSCC_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] {
-; IS__CGSCC_NPM-NEXT:  bb:
-; IS__CGSCC_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 64
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS________NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
+; IS________NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
+; IS________NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] {
+; IS________NPM-NEXT:  bb:
+; IS________NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
+; IS________NPM-NEXT:    ret void
 ;
 bb:
   %tmp = load <8 x i64>, <8 x i64>* %arg1
@@ -305,8 +273,7 @@
 ; IS__CGSCC_NPM-NEXT:    call void @llvm.memset.p0i8.i64(i8* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR6]]
 ; IS__CGSCC_NPM-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
 ; IS__CGSCC_NPM-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR7]]
-; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
+; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 bb:
@@ -331,25 +298,15 @@
 ; IS________OPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; IS________OPM-NEXT:    ret void
 ;
-; IS__TUNIT_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; IS__TUNIT_NPM-NEXT:  bb:
-; IS__TUNIT_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
-; IS__TUNIT_NPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
-; IS__CGSCC_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; IS__CGSCC_NPM-NEXT:  bb:
-; IS__CGSCC_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 64
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS________NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
+; IS________NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
+; IS________NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; IS________NPM-NEXT:  bb:
+; IS________NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
+; IS________NPM-NEXT:    ret void
 ;
 bb:
   %tmp = load <8 x i64>, <8 x i64>* %arg1
@@ -409,8 +366,7 @@
 ; IS__CGSCC_NPM-NEXT:    call void @llvm.memset.p0i8.i64(i8* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR6]]
 ; IS__CGSCC_NPM-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
 ; IS__CGSCC_NPM-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR7]]
-; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
+; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 bb:
@@ -437,7 +393,7 @@
 ;
 ; IS________NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
 ; IS________NPM-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
-; IS________NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1:[0-9]+]] {
+; IS________NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] {
 ; IS________NPM-NEXT:  bb:
 ; IS________NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; IS________NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
@@ -615,25 +571,15 @@
 ; IS________OPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; IS________OPM-NEXT:    ret void
 ;
-; IS__TUNIT_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; IS__TUNIT_NPM-NEXT:  bb:
-; IS__TUNIT_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
-; IS__TUNIT_NPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
-; IS__CGSCC_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; IS__CGSCC_NPM-NEXT:  bb:
-; IS__CGSCC_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 64
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS________NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
+; IS________NPM-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
+; IS________NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
+; IS________NPM-NEXT:  bb:
+; IS________NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
+; IS________NPM-NEXT:    ret void
 ;
 bb:
   %tmp = load <8 x i64>, <8 x i64>* %arg1
@@ -693,8 +639,7 @@
 ; IS__CGSCC_NPM-NEXT:    call void @llvm.memset.p0i8.i64(i8* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR6]]
 ; IS__CGSCC_NPM-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
 ; IS__CGSCC_NPM-NEXT:    call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR7]]
-; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
+; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 bb:
@@ -719,25 +664,15 @@
 ; IS________OPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; IS________OPM-NEXT:    ret void
 ;
-; IS__TUNIT_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR4]] {
-; IS__TUNIT_NPM-NEXT:  bb:
-; IS__TUNIT_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__TUNIT_NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
-; IS__TUNIT_NPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
-; IS__CGSCC_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR4]] {
-; IS__CGSCC_NPM-NEXT:  bb:
-; IS__CGSCC_NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 64
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS________NPM: Function Attrs: argmemonly inlinehint nofree norecurse nosync nounwind willreturn uwtable
+; IS________NPM-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
+; IS________NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] {
+; IS________NPM-NEXT:  bb:
+; IS________NPM-NEXT:    [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1_PRIV]], align 64
+; IS________NPM-NEXT:    store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
+; IS________NPM-NEXT:    ret void
 ;
 bb:
   %tmp = load <8 x i64>, <8 x i64>* %arg1
@@ -797,8 +732,7 @@
 ; IS__CGSCC_NPM-NEXT:    call void @llvm.memset.p0i8.i64(i8* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR6]]
 ; IS__CGSCC_NPM-NEXT:    [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
 ; IS__CGSCC_NPM-NEXT:    call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR7]]
-; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
-; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
+; IS__CGSCC_NPM-NEXT:    store <8 x i64> [[TMP0]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll
@@ -85,5 +85,5 @@
 ;.
 ; IS__CGSCC____: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind readonly willreturn }
 ; IS__CGSCC____: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind readnone willreturn }
-; IS__CGSCC____: attributes #[[ATTR2]] = { nounwind readonly willreturn }
+; IS__CGSCC____: attributes #[[ATTR2]] = { nosync nounwind readonly willreturn }
 ;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll
@@ -78,5 +78,5 @@
 ; IS__CGSCC____: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind willreturn writeonly }
 ; IS__CGSCC____: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind readnone willreturn }
 ; IS__CGSCC____: attributes #[[ATTR2]] = { nofree nosync nounwind willreturn writeonly }
-; IS__CGSCC____: attributes #[[ATTR3]] = { nounwind willreturn writeonly }
+; IS__CGSCC____: attributes #[[ATTR3]] = { nosync nounwind willreturn writeonly }
 ;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll
@@ -35,7 +35,7 @@
 ; Function Attrs: nounwind uwtable
 define internal void @callee_t0f(i8* nocapture readnone %tp13, i8* nocapture readnone %tp14, i8* nocapture readnone %tp15, i8* nocapture readnone %tp16, i8* nocapture readnone %tp17, ...) {
 ; CHECK-LABEL: define {{[^@]+}}@callee_t0f
-; CHECK-SAME: (i8* noalias nocapture nofree nonnull readnone [[TP13:%.*]], i8* noalias nocapture nofree nonnull readnone [[TP14:%.*]], i8* noalias nocapture nofree nonnull readnone [[TP15:%.*]], i8* noalias nocapture nofree nonnull readnone [[TP16:%.*]], i8* noalias nocapture nofree nonnull readnone [[TP17:%.*]], ...) {
+; CHECK-SAME: (i8* noalias nocapture nofree nonnull readnone align 4294967296 [[TP13:%.*]], i8* noalias nocapture nofree nonnull readnone align 4294967296 [[TP14:%.*]], i8* noalias nocapture nofree nonnull readnone align 4294967296 [[TP15:%.*]], i8* noalias nocapture nofree nonnull readnone align 4294967296 [[TP16:%.*]], i8* noalias nocapture nofree nonnull readnone align 4294967296 [[TP17:%.*]], ...) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    call void @sink(i32 noundef 0)
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
--- a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
@@ -120,8 +120,8 @@
 ; IS__CGSCC_NPM-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
 ; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 0
 ; IS__CGSCC_NPM-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 8
-; IS__CGSCC_NPM-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP0]] to i32
-; IS__CGSCC_NPM-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP1]]
+; IS__CGSCC_NPM-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; IS__CGSCC_NPM-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP3]]
 ; IS__CGSCC_NPM-NEXT:    ret i32 [[TMP7]]
 ;
 entry:
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/PR16052.ll b/llvm/test/Transforms/Attributor/IPConstantProp/PR16052.ll
--- a/llvm/test/Transforms/Attributor/IPConstantProp/PR16052.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/PR16052.ll
@@ -32,22 +32,13 @@
 
 define i64 @fn2b(i32 %arg) {
 ;
-; IS________OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS________OPM-LABEL: define {{[^@]+}}@fn2b
-; IS________OPM-SAME: (i32 [[ARG:%.*]]) #[[ATTR0]] {
-; IS________OPM-NEXT:  entry:
-; IS________OPM-NEXT:    [[CONV:%.*]] = sext i32 [[ARG]] to i64
-; IS________OPM-NEXT:    [[DIV:%.*]] = sdiv i64 8, [[CONV]]
-; IS________OPM-NEXT:    [[CALL2:%.*]] = call i64 @fn1(i64 [[DIV]]) #[[ATTR1:[0-9]+]]
-; IS________OPM-NEXT:    ret i64 [[CALL2]]
-;
-; IS________NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS________NPM-LABEL: define {{[^@]+}}@fn2b
-; IS________NPM-SAME: (i32 [[ARG:%.*]]) #[[ATTR0]] {
-; IS________NPM-NEXT:  entry:
-; IS________NPM-NEXT:    [[CONV:%.*]] = sext i32 [[ARG]] to i64
-; IS________NPM-NEXT:    [[DIV:%.*]] = sdiv i64 8, [[CONV]]
-; IS________NPM-NEXT:    ret i64 [[DIV]]
+; CHECK: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; CHECK-LABEL: define {{[^@]+}}@fn2b
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[ARG]] to i64
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i64 8, [[CONV]]
+; CHECK-NEXT:    ret i64 [[DIV]]
 ;
 entry:
   %conv = sext i32 %arg to i64
@@ -79,17 +70,11 @@
 }
 
 define internal i64 @fn1(i64 %p1) {
-; IS________OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS________OPM-LABEL: define {{[^@]+}}@fn1
-; IS________OPM-SAME: (i64 returned [[P1:%.*]]) #[[ATTR0]] {
-; IS________OPM-NEXT:  entry:
-; IS________OPM-NEXT:    ret i64 [[P1]]
-;
-; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@fn1
-; IS__CGSCC_NPM-SAME: (i64 [[P1:%.*]]) #[[ATTR0]] {
-; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    ret i64 undef
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@fn1
+; IS__CGSCC____-SAME: (i64 [[P1:%.*]]) #[[ATTR0]] {
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    ret i64 undef
 ;
 entry:
   %tobool = icmp ne i64 %p1, 0
@@ -97,11 +82,5 @@
   ret i64 %cond
 }
 ;.
-; IS__TUNIT_OPM: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind readnone willreturn }
-; IS__TUNIT_OPM: attributes #[[ATTR1]] = { nofree nosync nounwind readnone willreturn }
-;.
-; IS________NPM: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind readnone willreturn }
-;.
-; IS__CGSCC_OPM: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind readnone willreturn }
-; IS__CGSCC_OPM: attributes #[[ATTR1]] = { readnone willreturn }
+; CHECK: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind readnone willreturn }
 ;.
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll b/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll
--- a/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/PR26044.ll
@@ -8,67 +8,35 @@
 
 define void @fn2(i32* %P, i1 %C) {
 ;
-; IS__TUNIT_OPM: Function Attrs: argmemonly nofree norecurse nosync nounwind
-; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@fn2
-; IS__TUNIT_OPM-SAME: (i32* nocapture nofree [[P:%.*]], i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
-; IS__TUNIT_OPM-NEXT:  entry:
-; IS__TUNIT_OPM-NEXT:    br label [[IF_END:%.*]]
-; IS__TUNIT_OPM:       for.cond1:
-; IS__TUNIT_OPM-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
-; IS__TUNIT_OPM:       if.end:
-; IS__TUNIT_OPM-NEXT:    [[E_2:%.*]] = phi i32* [ [[P]], [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
-; IS__TUNIT_OPM-NEXT:    [[TMP0:%.*]] = load i32, i32* [[E_2]], align 4
-; IS__TUNIT_OPM-NEXT:    [[CALL:%.*]] = call i32 @fn1(i32 [[TMP0]]) #[[ATTR3:[0-9]+]]
-; IS__TUNIT_OPM-NEXT:    store i32 [[CALL]], i32* [[P]], align 4
-; IS__TUNIT_OPM-NEXT:    br label [[FOR_COND1]]
-; IS__TUNIT_OPM:       exit:
-; IS__TUNIT_OPM-NEXT:    ret void
+; IS__TUNIT____: Function Attrs: argmemonly nofree norecurse nosync nounwind
+; IS__TUNIT____-LABEL: define {{[^@]+}}@fn2
+; IS__TUNIT____-SAME: (i32* nocapture nofree [[P:%.*]], i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+; IS__TUNIT____-NEXT:  entry:
+; IS__TUNIT____-NEXT:    br label [[IF_END:%.*]]
+; IS__TUNIT____:       for.cond1:
+; IS__TUNIT____-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
+; IS__TUNIT____:       if.end:
+; IS__TUNIT____-NEXT:    [[E_2:%.*]] = phi i32* [ [[P]], [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
+; IS__TUNIT____-NEXT:    [[TMP0:%.*]] = load i32, i32* [[E_2]], align 4
+; IS__TUNIT____-NEXT:    store i32 [[TMP0]], i32* [[P]], align 4
+; IS__TUNIT____-NEXT:    br label [[FOR_COND1]]
+; IS__TUNIT____:       exit:
+; IS__TUNIT____-NEXT:    ret void
 ;
-; IS__TUNIT_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@fn2
-; IS__TUNIT_NPM-SAME: (i32* nocapture nofree [[P:%.*]], i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
-; IS__TUNIT_NPM-NEXT:  entry:
-; IS__TUNIT_NPM-NEXT:    br label [[IF_END:%.*]]
-; IS__TUNIT_NPM:       for.cond1:
-; IS__TUNIT_NPM-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
-; IS__TUNIT_NPM:       if.end:
-; IS__TUNIT_NPM-NEXT:    [[E_2:%.*]] = phi i32* [ [[P]], [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
-; IS__TUNIT_NPM-NEXT:    [[TMP0:%.*]] = load i32, i32* [[E_2]], align 4
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP0]], i32* [[P]], align 4
-; IS__TUNIT_NPM-NEXT:    br label [[FOR_COND1]]
-; IS__TUNIT_NPM:       exit:
-; IS__TUNIT_NPM-NEXT:    ret void
-;
-; IS__CGSCC_OPM: Function Attrs: argmemonly nofree norecurse nosync nounwind
-; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@fn2
-; IS__CGSCC_OPM-SAME: (i32* nocapture nofree nonnull align 4 dereferenceable(4) [[P:%.*]], i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
-; IS__CGSCC_OPM-NEXT:  entry:
-; IS__CGSCC_OPM-NEXT:    br label [[IF_END:%.*]]
-; IS__CGSCC_OPM:       for.cond1:
-; IS__CGSCC_OPM-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
-; IS__CGSCC_OPM:       if.end:
-; IS__CGSCC_OPM-NEXT:    [[E_2:%.*]] = phi i32* [ [[P]], [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
-; IS__CGSCC_OPM-NEXT:    [[TMP0:%.*]] = load i32, i32* [[E_2]], align 4
-; IS__CGSCC_OPM-NEXT:    [[CALL:%.*]] = call i32 @fn1(i32 [[TMP0]])
-; IS__CGSCC_OPM-NEXT:    store i32 [[CALL]], i32* [[P]], align 4
-; IS__CGSCC_OPM-NEXT:    br label [[FOR_COND1]]
-; IS__CGSCC_OPM:       exit:
-; IS__CGSCC_OPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@fn2
-; IS__CGSCC_NPM-SAME: (i32* nocapture nofree nonnull align 4 dereferenceable(4) [[P:%.*]], i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
-; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    br label [[IF_END:%.*]]
-; IS__CGSCC_NPM:       for.cond1:
-; IS__CGSCC_NPM-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
-; IS__CGSCC_NPM:       if.end:
-; IS__CGSCC_NPM-NEXT:    [[E_2:%.*]] = phi i32* [ [[P]], [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
-; IS__CGSCC_NPM-NEXT:    [[TMP0:%.*]] = load i32, i32* [[E_2]], align 4
-; IS__CGSCC_NPM-NEXT:    store i32 [[TMP0]], i32* [[P]], align 4
-; IS__CGSCC_NPM-NEXT:    br label [[FOR_COND1]]
-; IS__CGSCC_NPM:       exit:
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind
+; IS__CGSCC____-LABEL: define {{[^@]+}}@fn2
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull align 4 dereferenceable(4) [[P:%.*]], i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    br label [[IF_END:%.*]]
+; IS__CGSCC____:       for.cond1:
+; IS__CGSCC____-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
+; IS__CGSCC____:       if.end:
+; IS__CGSCC____-NEXT:    [[E_2:%.*]] = phi i32* [ [[P]], [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
+; IS__CGSCC____-NEXT:    [[TMP0:%.*]] = load i32, i32* [[E_2]], align 4
+; IS__CGSCC____-NEXT:    store i32 [[TMP0]], i32* [[P]], align 4
+; IS__CGSCC____-NEXT:    br label [[FOR_COND1]]
+; IS__CGSCC____:       exit:
+; IS__CGSCC____-NEXT:    ret void
 ;
 entry:
   br label %if.end
@@ -87,17 +55,11 @@
 }
 
 define internal i32 @fn1(i32 %p1) {
-; IS________OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS________OPM-LABEL: define {{[^@]+}}@fn1
-; IS________OPM-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1:[0-9]+]] {
-; IS________OPM-NEXT:  entry:
-; IS________OPM-NEXT:    ret i32 [[P1]]
-;
-; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@fn1
-; IS__CGSCC_NPM-SAME: (i32 [[P1:%.*]]) #[[ATTR1:[0-9]+]] {
-; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    ret i32 undef
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@fn1
+; IS__CGSCC____-SAME: (i32 [[P1:%.*]]) #[[ATTR1:[0-9]+]] {
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    ret i32 undef
 ;
 entry:
   %tobool = icmp ne i32 %p1, 0
@@ -107,67 +69,35 @@
 
 define void @fn_no_null_opt(i32* %P, i1 %C) null_pointer_is_valid {
 ;
-; IS__TUNIT_OPM: Function Attrs: nofree norecurse nosync nounwind null_pointer_is_valid
-; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@fn_no_null_opt
-; IS__TUNIT_OPM-SAME: (i32* nocapture nofree writeonly [[P:%.*]], i1 [[C:%.*]]) #[[ATTR2:[0-9]+]] {
-; IS__TUNIT_OPM-NEXT:  entry:
-; IS__TUNIT_OPM-NEXT:    br label [[IF_END:%.*]]
-; IS__TUNIT_OPM:       for.cond1:
-; IS__TUNIT_OPM-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
-; IS__TUNIT_OPM:       if.end:
-; IS__TUNIT_OPM-NEXT:    [[E_2:%.*]] = phi i32* [ undef, [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
-; IS__TUNIT_OPM-NEXT:    [[TMP0:%.*]] = load i32, i32* null, align 4
-; IS__TUNIT_OPM-NEXT:    [[CALL:%.*]] = call i32 @fn0(i32 [[TMP0]]) #[[ATTR3]]
-; IS__TUNIT_OPM-NEXT:    store i32 [[CALL]], i32* [[P]], align 4
-; IS__TUNIT_OPM-NEXT:    br label [[FOR_COND1]]
-; IS__TUNIT_OPM:       exit:
-; IS__TUNIT_OPM-NEXT:    ret void
-;
-; IS__TUNIT_NPM: Function Attrs: nofree norecurse nosync nounwind null_pointer_is_valid
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@fn_no_null_opt
-; IS__TUNIT_NPM-SAME: (i32* nocapture nofree writeonly [[P:%.*]], i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] {
-; IS__TUNIT_NPM-NEXT:  entry:
-; IS__TUNIT_NPM-NEXT:    br label [[IF_END:%.*]]
-; IS__TUNIT_NPM:       for.cond1:
-; IS__TUNIT_NPM-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
-; IS__TUNIT_NPM:       if.end:
-; IS__TUNIT_NPM-NEXT:    [[E_2:%.*]] = phi i32* [ undef, [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
-; IS__TUNIT_NPM-NEXT:    [[TMP0:%.*]] = load i32, i32* null, align 4
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP0]], i32* [[P]], align 4
-; IS__TUNIT_NPM-NEXT:    br label [[FOR_COND1]]
-; IS__TUNIT_NPM:       exit:
-; IS__TUNIT_NPM-NEXT:    ret void
+; IS__TUNIT____: Function Attrs: nofree norecurse nosync nounwind null_pointer_is_valid
+; IS__TUNIT____-LABEL: define {{[^@]+}}@fn_no_null_opt
+; IS__TUNIT____-SAME: (i32* nocapture nofree writeonly [[P:%.*]], i1 [[C:%.*]]) #[[ATTR1:[0-9]+]] {
+; IS__TUNIT____-NEXT:  entry:
+; IS__TUNIT____-NEXT:    br label [[IF_END:%.*]]
+; IS__TUNIT____:       for.cond1:
+; IS__TUNIT____-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
+; IS__TUNIT____:       if.end:
+; IS__TUNIT____-NEXT:    [[E_2:%.*]] = phi i32* [ undef, [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
+; IS__TUNIT____-NEXT:    [[TMP0:%.*]] = load i32, i32* null, align 4294967296
+; IS__TUNIT____-NEXT:    store i32 [[TMP0]], i32* [[P]], align 4
+; IS__TUNIT____-NEXT:    br label [[FOR_COND1]]
+; IS__TUNIT____:       exit:
+; IS__TUNIT____-NEXT:    ret void
 ;
-; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind null_pointer_is_valid
-; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@fn_no_null_opt
-; IS__CGSCC_OPM-SAME: (i32* nocapture nofree writeonly align 4 dereferenceable_or_null(4) [[P:%.*]], i1 [[C:%.*]]) #[[ATTR2:[0-9]+]] {
-; IS__CGSCC_OPM-NEXT:  entry:
-; IS__CGSCC_OPM-NEXT:    br label [[IF_END:%.*]]
-; IS__CGSCC_OPM:       for.cond1:
-; IS__CGSCC_OPM-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
-; IS__CGSCC_OPM:       if.end:
-; IS__CGSCC_OPM-NEXT:    [[E_2:%.*]] = phi i32* [ undef, [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
-; IS__CGSCC_OPM-NEXT:    [[TMP0:%.*]] = load i32, i32* null, align 4294967296
-; IS__CGSCC_OPM-NEXT:    [[CALL:%.*]] = call i32 @fn0(i32 [[TMP0]])
-; IS__CGSCC_OPM-NEXT:    store i32 [[CALL]], i32* [[P]], align 4
-; IS__CGSCC_OPM-NEXT:    br label [[FOR_COND1]]
-; IS__CGSCC_OPM:       exit:
-; IS__CGSCC_OPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind null_pointer_is_valid
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@fn_no_null_opt
-; IS__CGSCC_NPM-SAME: (i32* nocapture nofree writeonly align 4 dereferenceable_or_null(4) [[P:%.*]], i1 [[C:%.*]]) #[[ATTR2:[0-9]+]] {
-; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    br label [[IF_END:%.*]]
-; IS__CGSCC_NPM:       for.cond1:
-; IS__CGSCC_NPM-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
-; IS__CGSCC_NPM:       if.end:
-; IS__CGSCC_NPM-NEXT:    [[E_2:%.*]] = phi i32* [ undef, [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
-; IS__CGSCC_NPM-NEXT:    [[TMP0:%.*]] = load i32, i32* null, align 4294967296
-; IS__CGSCC_NPM-NEXT:    store i32 [[TMP0]], i32* [[P]], align 4
-; IS__CGSCC_NPM-NEXT:    br label [[FOR_COND1]]
-; IS__CGSCC_NPM:       exit:
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind null_pointer_is_valid
+; IS__CGSCC____-LABEL: define {{[^@]+}}@fn_no_null_opt
+; IS__CGSCC____-SAME: (i32* nocapture nofree writeonly align 4 dereferenceable_or_null(4) [[P:%.*]], i1 [[C:%.*]]) #[[ATTR2:[0-9]+]] {
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    br label [[IF_END:%.*]]
+; IS__CGSCC____:       for.cond1:
+; IS__CGSCC____-NEXT:    br i1 [[C]], label [[IF_END]], label [[EXIT:%.*]]
+; IS__CGSCC____:       if.end:
+; IS__CGSCC____-NEXT:    [[E_2:%.*]] = phi i32* [ undef, [[ENTRY:%.*]] ], [ null, [[FOR_COND1:%.*]] ]
+; IS__CGSCC____-NEXT:    [[TMP0:%.*]] = load i32, i32* null, align 4294967296
+; IS__CGSCC____-NEXT:    store i32 [[TMP0]], i32* [[P]], align 4
+; IS__CGSCC____-NEXT:    br label [[FOR_COND1]]
+; IS__CGSCC____:       exit:
+; IS__CGSCC____-NEXT:    ret void
 ;
 entry:
   br label %if.end
@@ -186,17 +116,11 @@
 }
 
 define internal i32 @fn0(i32 %p1) {
-; IS________OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS________OPM-LABEL: define {{[^@]+}}@fn0
-; IS________OPM-SAME: (i32 returned [[P1:%.*]]) #[[ATTR1]] {
-; IS________OPM-NEXT:  entry:
-; IS________OPM-NEXT:    ret i32 [[P1]]
-;
-; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@fn0
-; IS__CGSCC_NPM-SAME: (i32 [[P1:%.*]]) #[[ATTR1]] {
-; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    ret i32 undef
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@fn0
+; IS__CGSCC____-SAME: (i32 [[P1:%.*]]) #[[ATTR1]] {
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    ret i32 undef
 ;
 entry:
   %tobool = icmp ne i32 %p1, 0
@@ -204,15 +128,10 @@
   ret i32 %cond
 }
 ;.
-; IS__TUNIT_OPM: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind }
-; IS__TUNIT_OPM: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind readnone willreturn }
-; IS__TUNIT_OPM: attributes #[[ATTR2]] = { nofree norecurse nosync nounwind null_pointer_is_valid }
-; IS__TUNIT_OPM: attributes #[[ATTR3]] = { nofree nosync nounwind readnone }
-;.
-; IS__TUNIT_NPM: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind }
-; IS__TUNIT_NPM: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind null_pointer_is_valid }
+; IS__TUNIT____: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind }
+; IS__TUNIT____: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind null_pointer_is_valid }
 ;.
-; IS__CGSCC____: attributes #[[ATTR0:[0-9]+]] = { argmemonly nofree norecurse nosync nounwind }
-; IS__CGSCC____: attributes #[[ATTR1:[0-9]+]] = { nofree norecurse nosync nounwind readnone willreturn }
-; IS__CGSCC____: attributes #[[ATTR2:[0-9]+]] = { nofree norecurse nosync nounwind null_pointer_is_valid }
+; IS__CGSCC____: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind }
+; IS__CGSCC____: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind readnone willreturn }
+; IS__CGSCC____: attributes #[[ATTR2]] = { nofree norecurse nosync nounwind null_pointer_is_valid }
 ;.
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll b/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll
--- a/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
-; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=15 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM
-; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=15 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM
+; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=14 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM
+; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=14 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM
 ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal  -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM
 ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal  -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM
 ;
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/return-argument.ll b/llvm/test/Transforms/Attributor/IPConstantProp/return-argument.ll
--- a/llvm/test/Transforms/Attributor/IPConstantProp/return-argument.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/return-argument.ll
@@ -6,33 +6,33 @@
 
 ;; This function returns its second argument on all return statements
 define internal i32* @incdec(i1 %C, i32* %V) {
-; NOT_CGSCC_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn
-; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@incdec
-; NOT_CGSCC_NPM-SAME: (i1 [[C:%.*]], i32* noalias nofree noundef nonnull returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[V:%.*]]) #[[ATTR0:[0-9]+]] {
-; NOT_CGSCC_NPM-NEXT:    [[X:%.*]] = load i32, i32* [[V]], align 4
-; NOT_CGSCC_NPM-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
-; NOT_CGSCC_NPM:       T:
-; NOT_CGSCC_NPM-NEXT:    [[X1:%.*]] = add i32 [[X]], 1
-; NOT_CGSCC_NPM-NEXT:    store i32 [[X1]], i32* [[V]], align 4
-; NOT_CGSCC_NPM-NEXT:    ret i32* [[V]]
-; NOT_CGSCC_NPM:       F:
-; NOT_CGSCC_NPM-NEXT:    [[X2:%.*]] = sub i32 [[X]], 1
-; NOT_CGSCC_NPM-NEXT:    store i32 [[X2]], i32* [[V]], align 4
-; NOT_CGSCC_NPM-NEXT:    ret i32* [[V]]
+; IS__TUNIT____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@incdec
+; IS__TUNIT____-SAME: (i1 [[C:%.*]], i32* noalias nofree noundef nonnull returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; IS__TUNIT____-NEXT:    [[X:%.*]] = load i32, i32* [[V]], align 4
+; IS__TUNIT____-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
+; IS__TUNIT____:       T:
+; IS__TUNIT____-NEXT:    [[X1:%.*]] = add i32 [[X]], 1
+; IS__TUNIT____-NEXT:    store i32 [[X1]], i32* [[V]], align 4
+; IS__TUNIT____-NEXT:    ret i32* [[V]]
+; IS__TUNIT____:       F:
+; IS__TUNIT____-NEXT:    [[X2:%.*]] = sub i32 [[X]], 1
+; IS__TUNIT____-NEXT:    store i32 [[X2]], i32* [[V]], align 4
+; IS__TUNIT____-NEXT:    ret i32* [[V]]
 ;
-; IS__CGSCC_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@incdec
-; IS__CGSCC_NPM-SAME: (i1 [[C:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[V:%.*]]) #[[ATTR0:[0-9]+]] {
-; IS__CGSCC_NPM-NEXT:    [[X:%.*]] = load i32, i32* [[V]], align 4
-; IS__CGSCC_NPM-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
-; IS__CGSCC_NPM:       T:
-; IS__CGSCC_NPM-NEXT:    [[X1:%.*]] = add i32 [[X]], 1
-; IS__CGSCC_NPM-NEXT:    store i32 [[X1]], i32* [[V]], align 4
-; IS__CGSCC_NPM-NEXT:    ret i32* undef
-; IS__CGSCC_NPM:       F:
-; IS__CGSCC_NPM-NEXT:    [[X2:%.*]] = sub i32 [[X]], 1
-; IS__CGSCC_NPM-NEXT:    store i32 [[X2]], i32* [[V]], align 4
-; IS__CGSCC_NPM-NEXT:    ret i32* undef
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@incdec
+; IS__CGSCC____-SAME: (i1 [[C:%.*]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; IS__CGSCC____-NEXT:    [[X:%.*]] = load i32, i32* [[V]], align 4
+; IS__CGSCC____-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
+; IS__CGSCC____:       T:
+; IS__CGSCC____-NEXT:    [[X1:%.*]] = add i32 [[X]], 1
+; IS__CGSCC____-NEXT:    store i32 [[X1]], i32* [[V]], align 4
+; IS__CGSCC____-NEXT:    ret i32* undef
+; IS__CGSCC____:       F:
+; IS__CGSCC____-NEXT:    [[X2:%.*]] = sub i32 [[X]], 1
+; IS__CGSCC____-NEXT:    store i32 [[X2]], i32* [[V]], align 4
+; IS__CGSCC____-NEXT:    ret i32* undef
 ;
   %X = load i32, i32* %V
   br i1 %C, label %T, label %F
@@ -66,81 +66,43 @@
 }
 
 define void @caller(i1 %C) personality i32 (...)* @__gxx_personality_v0 {
-; IS__TUNIT_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@caller
-; IS__TUNIT_OPM-SAME: (i1 [[C:%.*]]) #[[ATTR1]] personality i32 (...)* @__gxx_personality_v0 {
-; IS__TUNIT_OPM-NEXT:    [[Q:%.*]] = alloca i32, align 4
-; IS__TUNIT_OPM-NEXT:    [[W:%.*]] = call align 4 i32* @incdec(i1 [[C]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q]]) #[[ATTR2:[0-9]+]]
-; IS__TUNIT_OPM-NEXT:    [[S1:%.*]] = call { i32, i32 } @foo(i32 noundef 1, i32 noundef 2) #[[ATTR3:[0-9]+]]
-; IS__TUNIT_OPM-NEXT:    [[X1:%.*]] = extractvalue { i32, i32 } [[S1]], 0
-; IS__TUNIT_OPM-NEXT:    [[S2:%.*]] = call { i32, i32 } @foo(i32 noundef 3, i32 noundef 4) #[[ATTR3]]
-; IS__TUNIT_OPM-NEXT:    br label [[OK:%.*]]
-; IS__TUNIT_OPM:       OK:
-; IS__TUNIT_OPM-NEXT:    [[X2:%.*]] = extractvalue { i32, i32 } [[S2]], 0
-; IS__TUNIT_OPM-NEXT:    [[Z:%.*]] = add i32 [[X1]], [[X2]]
-; IS__TUNIT_OPM-NEXT:    store i32 [[Z]], i32* [[W]], align 4
-; IS__TUNIT_OPM-NEXT:    br label [[RET:%.*]]
-; IS__TUNIT_OPM:       LPAD:
-; IS__TUNIT_OPM-NEXT:    unreachable
-; IS__TUNIT_OPM:       RET:
-; IS__TUNIT_OPM-NEXT:    ret void
+; IS__TUNIT____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@caller
+; IS__TUNIT____-SAME: (i1 [[C:%.*]]) #[[ATTR1]] personality i32 (...)* @__gxx_personality_v0 {
+; IS__TUNIT____-NEXT:    [[Q:%.*]] = alloca i32, align 4
+; IS__TUNIT____-NEXT:    [[W:%.*]] = call align 4 i32* @incdec(i1 [[C]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q]]) #[[ATTR2:[0-9]+]]
+; IS__TUNIT____-NEXT:    [[S1:%.*]] = call { i32, i32 } @foo(i32 noundef 1, i32 noundef 2) #[[ATTR3:[0-9]+]]
+; IS__TUNIT____-NEXT:    [[X1:%.*]] = extractvalue { i32, i32 } [[S1]], 0
+; IS__TUNIT____-NEXT:    [[S2:%.*]] = call { i32, i32 } @foo(i32 noundef 3, i32 noundef 4) #[[ATTR3]]
+; IS__TUNIT____-NEXT:    br label [[OK:%.*]]
+; IS__TUNIT____:       OK:
+; IS__TUNIT____-NEXT:    [[X2:%.*]] = extractvalue { i32, i32 } [[S2]], 0
+; IS__TUNIT____-NEXT:    [[Z:%.*]] = add i32 [[X1]], [[X2]]
+; IS__TUNIT____-NEXT:    store i32 [[Z]], i32* [[Q]], align 4
+; IS__TUNIT____-NEXT:    br label [[RET:%.*]]
+; IS__TUNIT____:       LPAD:
+; IS__TUNIT____-NEXT:    unreachable
+; IS__TUNIT____:       RET:
+; IS__TUNIT____-NEXT:    ret void
 ;
-; IS__TUNIT_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@caller
-; IS__TUNIT_NPM-SAME: (i1 [[C:%.*]]) #[[ATTR1]] personality i32 (...)* @__gxx_personality_v0 {
-; IS__TUNIT_NPM-NEXT:    [[Q:%.*]] = alloca i32, align 4
-; IS__TUNIT_NPM-NEXT:    [[W:%.*]] = call align 4 i32* @incdec(i1 [[C]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q]]) #[[ATTR2:[0-9]+]]
-; IS__TUNIT_NPM-NEXT:    [[S1:%.*]] = call { i32, i32 } @foo(i32 noundef 1, i32 noundef 2) #[[ATTR3:[0-9]+]]
-; IS__TUNIT_NPM-NEXT:    [[X1:%.*]] = extractvalue { i32, i32 } [[S1]], 0
-; IS__TUNIT_NPM-NEXT:    [[S2:%.*]] = call { i32, i32 } @foo(i32 noundef 3, i32 noundef 4) #[[ATTR3]]
-; IS__TUNIT_NPM-NEXT:    br label [[OK:%.*]]
-; IS__TUNIT_NPM:       OK:
-; IS__TUNIT_NPM-NEXT:    [[X2:%.*]] = extractvalue { i32, i32 } [[S2]], 0
-; IS__TUNIT_NPM-NEXT:    [[Z:%.*]] = add i32 [[X1]], [[X2]]
-; IS__TUNIT_NPM-NEXT:    store i32 [[Z]], i32* [[Q]], align 4
-; IS__TUNIT_NPM-NEXT:    br label [[RET:%.*]]
-; IS__TUNIT_NPM:       LPAD:
-; IS__TUNIT_NPM-NEXT:    unreachable
-; IS__TUNIT_NPM:       RET:
-; IS__TUNIT_NPM-NEXT:    ret void
-;
-; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@caller
-; IS__CGSCC_OPM-SAME: (i1 [[C:%.*]]) #[[ATTR1]] personality i32 (...)* @__gxx_personality_v0 {
-; IS__CGSCC_OPM-NEXT:    [[Q:%.*]] = alloca i32, align 4
-; IS__CGSCC_OPM-NEXT:    [[W:%.*]] = call align 4 i32* @incdec(i1 [[C]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q]]) #[[ATTR2:[0-9]+]]
-; IS__CGSCC_OPM-NEXT:    [[S1:%.*]] = call { i32, i32 } @foo(i32 noundef 1, i32 noundef 2) #[[ATTR3:[0-9]+]]
-; IS__CGSCC_OPM-NEXT:    [[X1:%.*]] = extractvalue { i32, i32 } [[S1]], 0
-; IS__CGSCC_OPM-NEXT:    [[S2:%.*]] = call { i32, i32 } @foo(i32 noundef 3, i32 noundef 4) #[[ATTR4:[0-9]+]]
-; IS__CGSCC_OPM-NEXT:    br label [[OK:%.*]]
-; IS__CGSCC_OPM:       OK:
-; IS__CGSCC_OPM-NEXT:    [[X2:%.*]] = extractvalue { i32, i32 } [[S2]], 0
-; IS__CGSCC_OPM-NEXT:    [[Z:%.*]] = add i32 [[X1]], [[X2]]
-; IS__CGSCC_OPM-NEXT:    store i32 [[Z]], i32* [[W]], align 4
-; IS__CGSCC_OPM-NEXT:    br label [[RET:%.*]]
-; IS__CGSCC_OPM:       LPAD:
-; IS__CGSCC_OPM-NEXT:    unreachable
-; IS__CGSCC_OPM:       RET:
-; IS__CGSCC_OPM-NEXT:    ret void
-;
-; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@caller
-; IS__CGSCC_NPM-SAME: (i1 [[C:%.*]]) #[[ATTR1]] personality i32 (...)* @__gxx_personality_v0 {
-; IS__CGSCC_NPM-NEXT:    [[Q:%.*]] = alloca i32, align 4
-; IS__CGSCC_NPM-NEXT:    [[W:%.*]] = call i32* @incdec(i1 [[C]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q]]) #[[ATTR2:[0-9]+]]
-; IS__CGSCC_NPM-NEXT:    [[S1:%.*]] = call { i32, i32 } @foo(i32 noundef 1, i32 noundef 2) #[[ATTR3:[0-9]+]]
-; IS__CGSCC_NPM-NEXT:    [[X1:%.*]] = extractvalue { i32, i32 } [[S1]], 0
-; IS__CGSCC_NPM-NEXT:    [[S2:%.*]] = call { i32, i32 } @foo(i32 noundef 3, i32 noundef 4) #[[ATTR4:[0-9]+]]
-; IS__CGSCC_NPM-NEXT:    br label [[OK:%.*]]
-; IS__CGSCC_NPM:       OK:
-; IS__CGSCC_NPM-NEXT:    [[X2:%.*]] = extractvalue { i32, i32 } [[S2]], 0
-; IS__CGSCC_NPM-NEXT:    [[Z:%.*]] = add i32 [[X1]], [[X2]]
-; IS__CGSCC_NPM-NEXT:    store i32 [[Z]], i32* [[Q]], align 4
-; IS__CGSCC_NPM-NEXT:    br label [[RET:%.*]]
-; IS__CGSCC_NPM:       LPAD:
-; IS__CGSCC_NPM-NEXT:    unreachable
-; IS__CGSCC_NPM:       RET:
-; IS__CGSCC_NPM-NEXT:    ret void
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@caller
+; IS__CGSCC____-SAME: (i1 [[C:%.*]]) #[[ATTR1]] personality i32 (...)* @__gxx_personality_v0 {
+; IS__CGSCC____-NEXT:    [[Q:%.*]] = alloca i32, align 4
+; IS__CGSCC____-NEXT:    [[W:%.*]] = call i32* @incdec(i1 [[C]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q]]) #[[ATTR2:[0-9]+]]
+; IS__CGSCC____-NEXT:    [[S1:%.*]] = call { i32, i32 } @foo(i32 noundef 1, i32 noundef 2) #[[ATTR3:[0-9]+]]
+; IS__CGSCC____-NEXT:    [[X1:%.*]] = extractvalue { i32, i32 } [[S1]], 0
+; IS__CGSCC____-NEXT:    [[S2:%.*]] = call { i32, i32 } @foo(i32 noundef 3, i32 noundef 4) #[[ATTR3]]
+; IS__CGSCC____-NEXT:    br label [[OK:%.*]]
+; IS__CGSCC____:       OK:
+; IS__CGSCC____-NEXT:    [[X2:%.*]] = extractvalue { i32, i32 } [[S2]], 0
+; IS__CGSCC____-NEXT:    [[Z:%.*]] = add i32 [[X1]], [[X2]]
+; IS__CGSCC____-NEXT:    store i32 [[Z]], i32* [[Q]], align 4
+; IS__CGSCC____-NEXT:    br label [[RET:%.*]]
+; IS__CGSCC____:       LPAD:
+; IS__CGSCC____-NEXT:    unreachable
+; IS__CGSCC____:       RET:
+; IS__CGSCC____-NEXT:    ret void
 ;
   %Q = alloca i32
   ;; Call incdec to see if %W is properly replaced by %Q
@@ -172,12 +134,11 @@
 ;.
 ; IS__TUNIT____: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind willreturn }
 ; IS__TUNIT____: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind readnone willreturn }
-; IS__TUNIT____: attributes #[[ATTR2:[0-9]+]] = { nofree nosync nounwind willreturn }
-; IS__TUNIT____: attributes #[[ATTR3:[0-9]+]] = { nofree nosync nounwind readnone willreturn }
+; IS__TUNIT____: attributes #[[ATTR2]] = { nofree nosync nounwind willreturn }
+; IS__TUNIT____: attributes #[[ATTR3]] = { nofree nosync nounwind readnone willreturn }
 ;.
 ; IS__CGSCC____: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind willreturn }
 ; IS__CGSCC____: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind readnone willreturn }
-; IS__CGSCC____: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn }
-; IS__CGSCC____: attributes #[[ATTR3:[0-9]+]] = { readnone willreturn }
-; IS__CGSCC____: attributes #[[ATTR4:[0-9]+]] = { nounwind readnone willreturn }
+; IS__CGSCC____: attributes #[[ATTR2]] = { nosync nounwind willreturn }
+; IS__CGSCC____: attributes #[[ATTR3]] = { nosync nounwind readnone willreturn }
 ;.
diff --git a/llvm/test/Transforms/Attributor/align.ll b/llvm/test/Transforms/Attributor/align.ll
--- a/llvm/test/Transforms/Attributor/align.ll
+++ b/llvm/test/Transforms/Attributor/align.ll
@@ -16,6 +16,7 @@
 ; CHECK: @[[A1:[a-zA-Z0-9_$"\\.-]+]] = common global i8 0, align 8
 ; CHECK: @[[A2:[a-zA-Z0-9_$"\\.-]+]] = common global i8 0, align 16
 ; CHECK: @[[CND:[a-zA-Z0-9_$"\\.-]+]] = external global i1
+; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = global i8 0, align 32
 ;.
 define i32* @test1(i32* align 8 %0) #0 {
 ; CHECK: Function Attrs: nofree noinline norecurse nosync nounwind readnone willreturn uwtable
@@ -1114,6 +1115,74 @@
 
 declare void @align4_callee(i8* align(4) %p)
 
+@G = global i8 0, align 32
+
+define internal i8* @aligned_8_return(i8* %a, i1 %c1, i1 %c2) norecurse {
+; NOT_CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; NOT_CGSCC_OPM-LABEL: define {{[^@]+}}@aligned_8_return
+; NOT_CGSCC_OPM-SAME: (i8* noalias nofree readnone align 16 "no-capture-maybe-returned" [[A:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR9]] {
+; NOT_CGSCC_OPM-NEXT:    [[STACK:%.*]] = alloca i8*, align 8
+; NOT_CGSCC_OPM-NEXT:    br i1 [[C1]], label [[T:%.*]], label [[F:%.*]]
+; NOT_CGSCC_OPM:       t:
+; NOT_CGSCC_OPM-NEXT:    [[GEP:%.*]] = getelementptr i8, i8* @G, i32 8
+; NOT_CGSCC_OPM-NEXT:    [[SEL:%.*]] = select i1 [[C2]], i8* [[A]], i8* [[GEP]]
+; NOT_CGSCC_OPM-NEXT:    store i8* [[SEL]], i8** [[STACK]], align 8
+; NOT_CGSCC_OPM-NEXT:    br label [[END:%.*]]
+; NOT_CGSCC_OPM:       f:
+; NOT_CGSCC_OPM-NEXT:    store i8* @G, i8** [[STACK]], align 8
+; NOT_CGSCC_OPM-NEXT:    br label [[END]]
+; NOT_CGSCC_OPM:       end:
+; NOT_CGSCC_OPM-NEXT:    [[L:%.*]] = load i8*, i8** [[STACK]], align 8
+; NOT_CGSCC_OPM-NEXT:    ret i8* [[L]]
+;
+; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@aligned_8_return
+; IS__CGSCC_OPM-SAME: (i8* noalias nofree readnone align 16 "no-capture-maybe-returned" [[A:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR10]] {
+; IS__CGSCC_OPM-NEXT:    [[STACK:%.*]] = alloca i8*, align 8
+; IS__CGSCC_OPM-NEXT:    br i1 [[C1]], label [[T:%.*]], label [[F:%.*]]
+; IS__CGSCC_OPM:       t:
+; IS__CGSCC_OPM-NEXT:    [[GEP:%.*]] = getelementptr i8, i8* @G, i32 8
+; IS__CGSCC_OPM-NEXT:    [[SEL:%.*]] = select i1 [[C2]], i8* [[A]], i8* [[GEP]]
+; IS__CGSCC_OPM-NEXT:    store i8* [[SEL]], i8** [[STACK]], align 8
+; IS__CGSCC_OPM-NEXT:    br label [[END:%.*]]
+; IS__CGSCC_OPM:       f:
+; IS__CGSCC_OPM-NEXT:    store i8* @G, i8** [[STACK]], align 8
+; IS__CGSCC_OPM-NEXT:    br label [[END]]
+; IS__CGSCC_OPM:       end:
+; IS__CGSCC_OPM-NEXT:    [[L:%.*]] = load i8*, i8** [[STACK]], align 8
+; IS__CGSCC_OPM-NEXT:    ret i8* [[L]]
+;
+  %stack = alloca i8* ; stack slot through which both branches hand a pointer to the load in %end
+  br i1 %c1, label %t, label %f
+t:
+  %gep = getelementptr i8, i8* @G, i32 8 ; @G is declared align 32, so 8 bytes past it is only guaranteed 8-byte aligned
+  %sel = select i1 %c2, i8* %a, i8* %gep ; either the argument (the only caller passes it as align 16) or @G+8
+  store i8* %sel, i8** %stack
+  br label %end
+f:
+  store i8* @G, i8** %stack ; on this path the slot holds the 32-byte aligned global itself
+  br label %end
+end:
+  %l = load i8*, i8** %stack ; reload whichever pointer was stored on the path taken
+  ret i8* %l ; the generated assertions for the caller expect "align 8" on the returned pointer
+}
+
+define i8* @aligned_8_return_caller(i8* align(16) %a, i1 %c1, i1 %c2) {
+; NOT_CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; NOT_CGSCC_OPM-LABEL: define {{[^@]+}}@aligned_8_return_caller
+; NOT_CGSCC_OPM-SAME: (i8* nofree readnone align 16 "no-capture-maybe-returned" [[A:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR9]] {
+; NOT_CGSCC_OPM-NEXT:    [[R:%.*]] = call align 8 i8* @aligned_8_return(i8* noalias nofree readnone align 16 "no-capture-maybe-returned" [[A]], i1 [[C1]], i1 [[C2]]) #[[ATTR12:[0-9]+]]
+; NOT_CGSCC_OPM-NEXT:    ret i8* [[R]]
+;
+; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@aligned_8_return_caller
+; IS__CGSCC_OPM-SAME: (i8* nofree readnone align 16 "no-capture-maybe-returned" [[A:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR10]] {
+; IS__CGSCC_OPM-NEXT:    [[R:%.*]] = call align 8 i8* @aligned_8_return(i8* noalias nofree readnone align 16 "no-capture-maybe-returned" [[A]], i1 [[C1]], i1 [[C2]]) #[[ATTR13:[0-9]+]]
+; IS__CGSCC_OPM-NEXT:    ret i8* [[R]]
+;
+  %r = call i8* @aligned_8_return(i8* %a, i1 %c1, i1 %c2) ; forwards the align(16) argument; the generated assertions expect "align 8" on this call's result
+  ret i8* %r ; hand the callee's pointer back unchanged
+}
 
 attributes #0 = { nounwind uwtable noinline }
 attributes #1 = { uwtable noinline }
@@ -1131,6 +1200,7 @@
 ; IS__TUNIT____: attributes #[[ATTR9]] = { nofree norecurse nosync nounwind readnone willreturn }
 ; IS__TUNIT____: attributes #[[ATTR10]] = { nofree norecurse nosync nounwind readonly willreturn }
 ; IS__TUNIT____: attributes #[[ATTR11]] = { nofree nosync nounwind readonly willreturn }
+; IS__TUNIT____: attributes #[[ATTR12]] = { nofree nosync nounwind readnone willreturn }
 ;.
 ; IS__CGSCC_OPM: attributes #[[ATTR0]] = { nofree noinline norecurse nosync nounwind readnone willreturn uwtable }
 ; IS__CGSCC_OPM: attributes #[[ATTR1]] = { nofree noinline nosync nounwind readnone willreturn uwtable }
@@ -1145,6 +1215,7 @@
 ; IS__CGSCC_OPM: attributes #[[ATTR10]] = { nofree norecurse nosync nounwind readnone willreturn }
 ; IS__CGSCC_OPM: attributes #[[ATTR11]] = { nofree norecurse nosync nounwind readonly willreturn }
 ; IS__CGSCC_OPM: attributes #[[ATTR12]] = { readonly willreturn }
+; IS__CGSCC_OPM: attributes #[[ATTR13]] = { readnone willreturn }
 ;.
 ; IS__CGSCC_NPM: attributes #[[ATTR0]] = { nofree noinline norecurse nosync nounwind readnone willreturn uwtable }
 ; IS__CGSCC_NPM: attributes #[[ATTR1]] = { noinline norecurse nounwind uwtable }
@@ -1158,4 +1229,5 @@
 ; IS__CGSCC_NPM: attributes #[[ATTR9]] = { nofree norecurse nosync nounwind readnone willreturn }
 ; IS__CGSCC_NPM: attributes #[[ATTR10]] = { nofree norecurse nosync nounwind readonly willreturn }
 ; IS__CGSCC_NPM: attributes #[[ATTR11]] = { readonly willreturn }
+; IS__CGSCC_NPM: attributes #[[ATTR12]] = { readnone willreturn }
 ;.
diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
--- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll
+++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
@@ -10,6 +10,9 @@
 ; TEST 1
 ; take mininimum of return values
 ;
+;.
+; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = global i64 0
+;.
 define i32* @test1(i32* dereferenceable(4) %0, double* dereferenceable(8) %1, i1 zeroext %2) local_unnamed_addr {
 ; CHECK: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
 ; CHECK-LABEL: define {{[^@]+}}@test1
@@ -316,7 +319,7 @@
 define void @test8(i8* %ptr) #0 {
 ; IS________OPM: Function Attrs: argmemonly nofree norecurse nosync nounwind writeonly
 ; IS________OPM-LABEL: define {{[^@]+}}@test8
-; IS________OPM-SAME: (i8* nocapture nofree nonnull writeonly [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
+; IS________OPM-SAME: (i8* nocapture nofree writeonly [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
 ; IS________OPM-NEXT:    br label [[TMP1:%.*]]
 ; IS________OPM:       1:
 ; IS________OPM-NEXT:    [[I_0:%.*]] = phi i32 [ 20, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[TMP5:%.*]] ]
@@ -915,6 +918,32 @@
 declare void @unknown_use32(i32*) willreturn nounwind
 declare void @llvm.assume(i1)
 
+@g = global i64 0
+define void @max_offset(i1 %c) {
+; CHECK: Function Attrs: nounwind willreturn
+; CHECK-LABEL: define {{[^@]+}}@max_offset
+; CHECK-SAME: (i1 [[C:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    br label [[F]]
+; CHECK:       f:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i8* [ getelementptr (i8, i8* bitcast (i64* @g to i8*), i64 2), [[T]] ], [ bitcast (i64* @g to i8*), [[ENTRY:%.*]] ]
+; CHECK-NEXT:    call void @unknown_use8(i8* noundef align 2 dereferenceable_or_null(6) [[PHI]]) #[[ATTR1]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %bc = bitcast i64* @g to i8* ; byte pointer to the start of the i64 global @g
+  br i1 %c, label %t, label %f
+t:
+  %gep = getelementptr i8, i8* %bc, i64 2 ; same global, 2 bytes in
+  br label %f
+f:
+  %phi = phi i8* [%gep, %t], [%bc, %entry] ; offset into @g is 2 when coming from %t, 0 when coming from %entry
+  call void @unknown_use8(i8* %phi) ; generated assertions expect align 2 and dereferenceable_or_null(6) here, i.e. the 8-byte global minus the larger offset of 2
+  ret void
+}
+
 !0 = !{i64 10, i64 100}
 
 ;.
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll
--- a/llvm/test/Transforms/Attributor/heap_to_stack.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll
@@ -220,12 +220,21 @@
 
 ; leave alone a constant-but-invalid alignment
 define void @test3d(i8* %p) {
-; CHECK-LABEL: define {{[^@]+}}@test3d
-; CHECK-SAME; (i8* nocapture [[P:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call noalias i8* @aligned_alloc(i64 noundef 33, i64 noundef 128)
-; CHECK:    tail call void @free(i8* noalias nocapture [[TMP1]])
-; CHECK-NEXT:    ret void
+; IS________OPM-LABEL: define {{[^@]+}}@test3d
+; IS________OPM-SAME: (i8* nocapture [[P:%.*]]) {
+; IS________OPM-NEXT:    [[TMP1:%.*]] = tail call noalias i8* @aligned_alloc(i64 noundef 33, i64 noundef 128)
+; IS________OPM-NEXT:    tail call void @nofree_arg_only(i8* nocapture nofree [[TMP1]], i8* nocapture [[P]])
+; IS________OPM-NEXT:    tail call void @free(i8* noalias nocapture [[TMP1]])
+; IS________OPM-NEXT:    ret void
 ;
+; IS________NPM-LABEL: define {{[^@]+}}@test3d
+; IS________NPM-SAME: (i8* nocapture [[P:%.*]]) {
+; IS________NPM-NEXT:    [[TMP1:%.*]] = tail call noalias i8* @aligned_alloc(i64 noundef 33, i64 noundef 128)
+; IS________NPM-NEXT:    tail call void @nofree_arg_only(i8* noalias nocapture nofree [[TMP1]], i8* nocapture [[P]])
+; IS________NPM-NEXT:    tail call void @free(i8* noalias nocapture [[TMP1]])
+; IS________NPM-NEXT:    ret void
+;
+; CHECK-SAME; (i8* nocapture [[P:%.*]]) {
   %1 = tail call noalias i8* @aligned_alloc(i64 33, i64 128)
   tail call void @nofree_arg_only(i8* %1, i8* %p)
   tail call void @free(i8* %1)
@@ -578,8 +587,9 @@
 ; IS________NPM-NEXT:    [[TMP14]] = add nsw i32 [[DOT1]], 1
 ; IS________NPM-NEXT:    br label [[TMP8]]
 ; IS________NPM:       15:
-; IS________NPM-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP3]], align 4
-; IS________NPM-NEXT:    ret i32 [[TMP16]]
+; IS________NPM-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP3]] to i8*
+; IS________NPM-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP3]], align 4
+; IS________NPM-NEXT:    ret i32 [[TMP17]]
 ;
   %2 = call noalias i8* @malloc(i64 4)
   %3 = bitcast i8* %2 to i32*
diff --git a/llvm/test/Transforms/Attributor/internal-noalias.ll b/llvm/test/Transforms/Attributor/internal-noalias.ll
--- a/llvm/test/Transforms/Attributor/internal-noalias.ll
+++ b/llvm/test/Transforms/Attributor/internal-noalias.ll
@@ -144,12 +144,17 @@
 }
 
 define internal i32 @noalias_args_argmem_rn(i32* %A, i32* %B) #1 {
-; CHECK: Function Attrs: argmemonly nofree noinline norecurse nosync nounwind willreturn uwtable
-; CHECK-LABEL: define {{[^@]+}}@noalias_args_argmem_rn
-; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT:    [[T0:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    store i32 0, i32* [[B]], align 4
-; CHECK-NEXT:    ret i32 [[T0]]
+; IS__TUNIT____: Function Attrs: argmemonly nofree noinline norecurse nosync nounwind willreturn uwtable
+; IS__TUNIT____-LABEL: define {{[^@]+}}@noalias_args_argmem_rn
+; IS__TUNIT____-SAME: (i32* noalias nocapture nofree noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR1]] {
+; IS__TUNIT____-NEXT:    [[T0:%.*]] = load i32, i32* [[B]], align 4
+; IS__TUNIT____-NEXT:    ret i32 [[T0]]
+;
+; IS__CGSCC____: Function Attrs: nofree noinline norecurse nosync nounwind readnone willreturn uwtable
+; IS__CGSCC____-LABEL: define {{[^@]+}}@noalias_args_argmem_rn
+; IS__CGSCC____-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR2]] {
+; IS__CGSCC____-NEXT:    [[T0:%.*]] = load i32, i32* undef, align 4
+; IS__CGSCC____-NEXT:    ret i32 undef
 ;
   %t0 = load i32, i32* %B, align 4
   store i32 0, i32* %B
@@ -170,8 +175,7 @@
 ; IS__CGSCC____-SAME: () #[[ATTR3]] {
 ; IS__CGSCC____-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; IS__CGSCC____-NEXT:    store i32 5, i32* [[B]], align 4
-; IS__CGSCC____-NEXT:    [[CALL:%.*]] = call i32 @noalias_args_argmem_rn(i32* noalias nocapture nofree noundef nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR6:[0-9]+]]
-; IS__CGSCC____-NEXT:    ret i32 [[CALL]]
+; IS__CGSCC____-NEXT:    ret i32 5
 ;
   %B = alloca i32, align 4
   store i32 5, i32* %B, align 4
@@ -194,5 +198,4 @@
 ; IS__CGSCC____: attributes #[[ATTR3]] = { nofree norecurse nosync nounwind readnone willreturn }
 ; IS__CGSCC____: attributes #[[ATTR4]] = { nounwind readonly }
 ; IS__CGSCC____: attributes #[[ATTR5]] = { nosync nounwind readonly }
-; IS__CGSCC____: attributes #[[ATTR6]] = { nounwind willreturn }
 ;.
diff --git a/llvm/test/Transforms/Attributor/liveness.ll b/llvm/test/Transforms/Attributor/liveness.ll
--- a/llvm/test/Transforms/Attributor/liveness.ll
+++ b/llvm/test/Transforms/Attributor/liveness.ll
@@ -2281,7 +2281,7 @@
   call void %fp(i32* %a, i32* %b, i32* %a, i64 -1, i32** null)
   ret void
 }
-; FIXME: We have to prevent the propagation of %fp in the new pm CGSCC pass until the CallGraphUpdater can handle the new call edge.
+
 define internal void @call_via_pointer_with_dead_args_internal_a(i32* %a, i32* %b, void (i32*, i32*, i32*, i64, i32**)* %fp) {
 ; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_internal_a
 ; NOT_CGSCC_NPM-SAME: (i32* [[A:%.*]], i32* noundef nonnull align 128 dereferenceable(4) [[B:%.*]]) {
@@ -2289,8 +2289,8 @@
 ; NOT_CGSCC_NPM-NEXT:    ret void
 ;
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_internal_a
-; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* noundef nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull [[FP:%.*]]) {
-; IS__CGSCC____-NEXT:    call void [[FP]](i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[B]], i32* [[A]], i64 -1, i32** null)
+; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* noalias nocapture nofree nonnull readnone align 128 dereferenceable(4) [[B:%.*]]) {
+; IS__CGSCC____-NEXT:    call void @called_via_pointer(i32* [[A]], i32* noalias nocapture nofree nonnull readnone align 128 dereferenceable(4) undef, i32* noalias nocapture nofree readnone undef, i64 undef, i32** noalias nocapture nofree readnone align 4294967296 undef)
 ; IS__CGSCC____-NEXT:    ret void
 ;
   call void %fp(i32* %a, i32* %b, i32* %a, i64 -1, i32** null)
@@ -2303,8 +2303,8 @@
 ; NOT_CGSCC_NPM-NEXT:    ret void
 ;
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_internal_b
-; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* noundef nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull [[FP:%.*]]) {
-; IS__CGSCC____-NEXT:    call void [[FP]](i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[B]], i32* [[A]], i64 -1, i32** null)
+; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* noalias nocapture nofree nonnull readnone align 128 dereferenceable(4) [[B:%.*]]) {
+; IS__CGSCC____-NEXT:    call void @called_via_pointer_internal_2(i32* [[A]])
 ; IS__CGSCC____-NEXT:    ret void
 ;
   call void %fp(i32* %a, i32* %b, i32* %a, i64 -1, i32** null)
@@ -2327,12 +2327,10 @@
 ; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* [[B:%.*]]) {
 ; IS__CGSCC____-NEXT:    [[PTR1:%.*]] = alloca i32, align 128
 ; IS__CGSCC____-NEXT:    [[PTR2:%.*]] = alloca i32, align 128
-; IS__CGSCC____-NEXT:    [[PTR3:%.*]] = alloca i32, align 128
-; IS__CGSCC____-NEXT:    [[PTR4:%.*]] = alloca i32, align 128
 ; IS__CGSCC____-NEXT:    call void @call_via_pointer_with_dead_args(i32* [[A]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR1]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull @called_via_pointer)
 ; IS__CGSCC____-NEXT:    call void @call_via_pointer_with_dead_args(i32* [[A]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR2]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull @called_via_pointer_internal_1)
-; IS__CGSCC____-NEXT:    call void @call_via_pointer_with_dead_args_internal_a(i32* [[B]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR3]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull @called_via_pointer)
-; IS__CGSCC____-NEXT:    call void @call_via_pointer_with_dead_args_internal_b(i32* [[B]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR4]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull @called_via_pointer_internal_2)
+; IS__CGSCC____-NEXT:    call void @call_via_pointer_with_dead_args_internal_a(i32* [[B]], i32* noalias nocapture nofree nonnull readnone align 128 dereferenceable(4) undef)
+; IS__CGSCC____-NEXT:    call void @call_via_pointer_with_dead_args_internal_b(i32* [[B]], i32* noalias nocapture nofree nonnull readnone align 128 dereferenceable(4) undef)
 ; IS__CGSCC____-NEXT:    ret void
 ;
   %ptr1 = alloca i32, align 128
@@ -2373,12 +2371,19 @@
 }
 ; FIXME: Figure out why the MODULE has the unused arguments still
 define internal void @called_via_pointer_internal_2(i32* %a, i32* %b, i32* %c, i64 %d, i32** %e) {
-; CHECK-LABEL: define {{[^@]+}}@called_via_pointer_internal_2
-; CHECK-SAME: (i32* [[A:%.*]], i32* nocapture nofree readnone [[B:%.*]], i32* nocapture nofree readnone [[C:%.*]], i64 [[D:%.*]], i32** nocapture nofree readnone [[E:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    tail call void @use_i32p(i32* [[A]])
-; CHECK-NEXT:    tail call void @use_i32p(i32* [[A]])
-; CHECK-NEXT:    ret void
+; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@called_via_pointer_internal_2
+; NOT_CGSCC_NPM-SAME: (i32* [[A:%.*]], i32* nocapture nofree readnone [[B:%.*]], i32* nocapture nofree readnone [[C:%.*]], i64 [[D:%.*]], i32** nocapture nofree readnone [[E:%.*]]) {
+; NOT_CGSCC_NPM-NEXT:  entry:
+; NOT_CGSCC_NPM-NEXT:    tail call void @use_i32p(i32* [[A]])
+; NOT_CGSCC_NPM-NEXT:    tail call void @use_i32p(i32* [[A]])
+; NOT_CGSCC_NPM-NEXT:    ret void
+;
+; IS__CGSCC____-LABEL: define {{[^@]+}}@called_via_pointer_internal_2
+; IS__CGSCC____-SAME: (i32* [[A:%.*]]) {
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    tail call void @use_i32p(i32* [[A]])
+; IS__CGSCC____-NEXT:    tail call void @use_i32p(i32* [[A]])
+; IS__CGSCC____-NEXT:    ret void
 ;
 entry:
   tail call void @use_i32p(i32* %a)
diff --git a/llvm/test/Transforms/Attributor/noalias.ll b/llvm/test/Transforms/Attributor/noalias.ll
--- a/llvm/test/Transforms/Attributor/noalias.ll
+++ b/llvm/test/Transforms/Attributor/noalias.ll
@@ -402,23 +402,23 @@
 ; IS________OPM-LABEL: define {{[^@]+}}@test12_4() {
 ; IS________OPM-NEXT:    [[A:%.*]] = tail call noalias i8* @malloc(i64 noundef 4)
 ; IS________OPM-NEXT:    [[B:%.*]] = tail call noalias i8* @malloc(i64 noundef 4)
-; IS________OPM-NEXT:    [[A_0:%.*]] = getelementptr i8, i8* [[A]], i64 0
 ; IS________OPM-NEXT:    [[A_1:%.*]] = getelementptr i8, i8* [[A]], i64 1
-; IS________OPM-NEXT:    [[B_0:%.*]] = getelementptr i8, i8* [[B]], i64 0
 ; IS________OPM-NEXT:    tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[B]])
-; IS________OPM-NEXT:    tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A_0]])
+; IS________OPM-NEXT:    tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A]])
 ; IS________OPM-NEXT:    tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A_1]])
-; IS________OPM-NEXT:    tail call void @two_args(i8* nocapture [[A_0]], i8* nocapture [[B_0]])
+; IS________OPM-NEXT:    tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[B]])
 ; IS________OPM-NEXT:    ret void
 ;
 ; NOT_TUNIT_OPM-LABEL: define {{[^@]+}}@test12_4() {
 ; NOT_TUNIT_OPM-NEXT:    [[A:%.*]] = tail call noalias i8* @malloc(i64 noundef 4)
 ; NOT_TUNIT_OPM-NEXT:    [[B:%.*]] = tail call noalias i8* @malloc(i64 noundef 4)
+; NOT_TUNIT_OPM-NEXT:    [[A_0:%.*]] = getelementptr i8, i8* [[A]], i64 0
 ; NOT_TUNIT_OPM-NEXT:    [[A_1:%.*]] = getelementptr i8, i8* [[A]], i64 1
+; NOT_TUNIT_OPM-NEXT:    [[B_0:%.*]] = getelementptr i8, i8* [[B]], i64 0
 ; NOT_TUNIT_OPM-NEXT:    tail call void @two_args(i8* noalias nocapture [[A]], i8* noalias nocapture [[B]])
-; NOT_TUNIT_OPM-NEXT:    tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A]])
+; NOT_TUNIT_OPM-NEXT:    tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A_0]])
 ; NOT_TUNIT_OPM-NEXT:    tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A_1]])
-; NOT_TUNIT_OPM-NEXT:    tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[B]])
+; NOT_TUNIT_OPM-NEXT:    tail call void @two_args(i8* nocapture [[A_0]], i8* nocapture [[B_0]])
 ; NOT_TUNIT_OPM-NEXT:    ret void
 ;
   %A = tail call noalias i8* @malloc(i64 4)
@@ -452,14 +452,14 @@
 define void @test13_use_noalias(){
 ; IS________OPM-LABEL: define {{[^@]+}}@test13_use_noalias() {
 ; IS________OPM-NEXT:    [[M1:%.*]] = tail call noalias i8* @malloc(i64 noundef 4)
-; IS________OPM-NEXT:    [[C1:%.*]] = bitcast i8* [[M1]] to i16*
-; IS________OPM-NEXT:    [[C2:%.*]] = bitcast i16* [[C1]] to i8*
-; IS________OPM-NEXT:    call void @use_i8_internal(i8* noalias nocapture [[C2]])
+; IS________OPM-NEXT:    call void @use_i8_internal(i8* noalias nocapture [[M1]])
 ; IS________OPM-NEXT:    ret void
 ;
 ; NOT_TUNIT_OPM-LABEL: define {{[^@]+}}@test13_use_noalias() {
 ; NOT_TUNIT_OPM-NEXT:    [[M1:%.*]] = tail call noalias i8* @malloc(i64 noundef 4)
-; NOT_TUNIT_OPM-NEXT:    call void @use_i8_internal(i8* noalias nocapture [[M1]])
+; NOT_TUNIT_OPM-NEXT:    [[C1:%.*]] = bitcast i8* [[M1]] to i16*
+; NOT_TUNIT_OPM-NEXT:    [[C2:%.*]] = bitcast i16* [[C1]] to i8*
+; NOT_TUNIT_OPM-NEXT:    call void @use_i8_internal(i8* noalias nocapture [[C2]])
 ; NOT_TUNIT_OPM-NEXT:    ret void
 ;
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test13_use_noalias()
@@ -478,17 +478,17 @@
 define void @test13_use_alias(){
 ; IS________OPM-LABEL: define {{[^@]+}}@test13_use_alias() {
 ; IS________OPM-NEXT:    [[M1:%.*]] = tail call noalias i8* @malloc(i64 noundef 4)
-; IS________OPM-NEXT:    [[C1:%.*]] = bitcast i8* [[M1]] to i16*
-; IS________OPM-NEXT:    [[C2A:%.*]] = bitcast i16* [[C1]] to i8*
-; IS________OPM-NEXT:    [[C2B:%.*]] = bitcast i16* [[C1]] to i8*
-; IS________OPM-NEXT:    call void @use_i8_internal(i8* nocapture [[C2A]])
-; IS________OPM-NEXT:    call void @use_i8_internal(i8* nocapture [[C2B]])
+; IS________OPM-NEXT:    call void @use_i8_internal(i8* nocapture [[M1]])
+; IS________OPM-NEXT:    call void @use_i8_internal(i8* nocapture [[M1]])
 ; IS________OPM-NEXT:    ret void
 ;
 ; NOT_TUNIT_OPM-LABEL: define {{[^@]+}}@test13_use_alias() {
 ; NOT_TUNIT_OPM-NEXT:    [[M1:%.*]] = tail call noalias i8* @malloc(i64 noundef 4)
-; NOT_TUNIT_OPM-NEXT:    call void @use_i8_internal(i8* nocapture [[M1]])
-; NOT_TUNIT_OPM-NEXT:    call void @use_i8_internal(i8* nocapture [[M1]])
+; NOT_TUNIT_OPM-NEXT:    [[C1:%.*]] = bitcast i8* [[M1]] to i16*
+; NOT_TUNIT_OPM-NEXT:    [[C2A:%.*]] = bitcast i16* [[C1]] to i8*
+; NOT_TUNIT_OPM-NEXT:    [[C2B:%.*]] = bitcast i16* [[C1]] to i8*
+; NOT_TUNIT_OPM-NEXT:    call void @use_i8_internal(i8* nocapture [[C2A]])
+; NOT_TUNIT_OPM-NEXT:    call void @use_i8_internal(i8* nocapture [[C2B]])
 ; NOT_TUNIT_OPM-NEXT:    ret void
 ;
   %m1 = tail call noalias i8* @malloc(i64 4)
diff --git a/llvm/test/Transforms/Attributor/nodelete.ll b/llvm/test/Transforms/Attributor/nodelete.ll
--- a/llvm/test/Transforms/Attributor/nodelete.ll
+++ b/llvm/test/Transforms/Attributor/nodelete.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
-; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM
-; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM
+; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM
+; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=4 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM
 ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal  -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM
 ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal  -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM
 
@@ -32,6 +32,7 @@
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@f2
 ; IS__CGSCC____-SAME: () #[[ATTR0]] align 2 {
 ; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    [[TMP0:%.*]] = bitcast %a* undef to %b*
 ; IS__CGSCC____-NEXT:    ret i64 undef
 ;
 entry:
diff --git a/llvm/test/Transforms/Attributor/nonnull.ll b/llvm/test/Transforms/Attributor/nonnull.ll
--- a/llvm/test/Transforms/Attributor/nonnull.ll
+++ b/llvm/test/Transforms/Attributor/nonnull.ll
@@ -257,7 +257,7 @@
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i8* [ [[RET]], [[ENTRY:%.*]] ], [ [[PHI]], [[LOOP]] ]
 ; CHECK-NEXT:    br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
-; CHECK-NEXT:    ret i8* [[PHI]]
+; CHECK-NEXT:    ret i8* [[RET]]
 ;
 entry:
   %ret = call i8* @ret_nonnull()
diff --git a/llvm/test/Transforms/Attributor/range.ll b/llvm/test/Transforms/Attributor/range.ll
--- a/llvm/test/Transforms/Attributor/range.ll
+++ b/llvm/test/Transforms/Attributor/range.ll
@@ -967,8 +967,7 @@
 ; IS__TUNIT_OPM-NEXT:    [[R2:%.*]] = call i32 @ret1or2(i1 [[C2]]) #[[ATTR5]], !range [[RNG4]]
 ; IS__TUNIT_OPM-NEXT:    [[A:%.*]] = add i32 [[R1]], [[R2]]
 ; IS__TUNIT_OPM-NEXT:    [[I1:%.*]] = icmp sle i32 [[A]], 3
-; IS__TUNIT_OPM-NEXT:    [[F:%.*]] = and i1 [[I1]], true
-; IS__TUNIT_OPM-NEXT:    ret i1 [[F]]
+; IS__TUNIT_OPM-NEXT:    ret i1 [[I1]]
 ;
 ; IS________NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
 ; IS________NPM-LABEL: define {{[^@]+}}@callee_range_2
@@ -986,8 +985,7 @@
 ; IS__CGSCC_OPM-NEXT:    [[R2:%.*]] = call i32 @ret1or2(i1 [[C2]]) #[[ATTR5]], !range [[RNG5]]
 ; IS__CGSCC_OPM-NEXT:    [[A:%.*]] = add i32 [[R1]], [[R2]]
 ; IS__CGSCC_OPM-NEXT:    [[I1:%.*]] = icmp sle i32 [[A]], 3
-; IS__CGSCC_OPM-NEXT:    [[F:%.*]] = and i1 [[I1]], true
-; IS__CGSCC_OPM-NEXT:    ret i1 [[F]]
+; IS__CGSCC_OPM-NEXT:    ret i1 [[I1]]
 ;
   %r1 = call i32 @ret1or2(i1 %c1)
   %r2 = call i32 @ret1or2(i1 %c2)
diff --git a/llvm/test/Transforms/Attributor/value-simplify-gpu.ll b/llvm/test/Transforms/Attributor/value-simplify-gpu.ll
--- a/llvm/test/Transforms/Attributor/value-simplify-gpu.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-gpu.ll
@@ -69,7 +69,6 @@
 ; IS__CGSCC____-NEXT:    call void @level2Kernelb() #[[ATTR4]]
 ; IS__CGSCC____-NEXT:    br label [[IF_END]]
 ; IS__CGSCC____:       if.end:
-; IS__CGSCC____-NEXT:    call void @level2Kernelall_late() #[[ATTR6:[0-9]+]]
 ; IS__CGSCC____-NEXT:    ret void
 ;
 entry:
@@ -112,7 +111,7 @@
 ; IS__TUNIT____-NEXT:  entry:
 ; IS__TUNIT____-NEXT:    [[TMP0:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableKernel to i32*), align 4
 ; IS__TUNIT____-NEXT:    [[TMP1:%.*]] = load i32, i32* @ReachableKernelAS0, align 4
-; IS__TUNIT____-NEXT:    call void @use(i32 [[TMP0]], i32 [[TMP1]], i32 noundef 42) #[[ATTR6:[0-9]+]]
+; IS__TUNIT____-NEXT:    call void @use(i32 noundef [[TMP0]], i32 noundef [[TMP1]], i32 noundef 42) #[[ATTR6:[0-9]+]]
 ; IS__TUNIT____-NEXT:    ret void
 ;
 ; IS__CGSCC____: Function Attrs: norecurse nosync nounwind
@@ -122,7 +121,7 @@
 ; IS__CGSCC____-NEXT:    [[TMP0:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableKernel to i32*), align 4
 ; IS__CGSCC____-NEXT:    [[TMP1:%.*]] = load i32, i32* @ReachableKernelAS0, align 4
 ; IS__CGSCC____-NEXT:    [[TMP2:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @UnreachableKernel to i32*), align 4
-; IS__CGSCC____-NEXT:    call void @use(i32 [[TMP0]], i32 [[TMP1]], i32 noundef 42) #[[ATTR4]]
+; IS__CGSCC____-NEXT:    call void @use(i32 noundef [[TMP0]], i32 noundef [[TMP1]], i32 noundef 42) #[[ATTR4]]
 ; IS__CGSCC____-NEXT:    ret void
 ;
 entry:
@@ -140,7 +139,7 @@
 ; IS__TUNIT____-NEXT:  entry:
 ; IS__TUNIT____-NEXT:    [[TMP0:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableKernel to i32*), align 4
 ; IS__TUNIT____-NEXT:    [[TMP1:%.*]] = load i32, i32* @ReachableKernelAS0, align 4
-; IS__TUNIT____-NEXT:    call void @use(i32 [[TMP0]], i32 [[TMP1]], i32 noundef 42) #[[ATTR6]]
+; IS__TUNIT____-NEXT:    call void @use(i32 noundef [[TMP0]], i32 noundef [[TMP1]], i32 noundef 42) #[[ATTR6]]
 ; IS__TUNIT____-NEXT:    ret void
 ;
 ; IS__CGSCC____: Function Attrs: norecurse nosync nounwind
@@ -150,7 +149,7 @@
 ; IS__CGSCC____-NEXT:    [[TMP0:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableKernel to i32*), align 4
 ; IS__CGSCC____-NEXT:    [[TMP1:%.*]] = load i32, i32* @ReachableKernelAS0, align 4
 ; IS__CGSCC____-NEXT:    [[TMP2:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @UnreachableKernel to i32*), align 4
-; IS__CGSCC____-NEXT:    call void @use(i32 [[TMP0]], i32 [[TMP1]], i32 noundef 42) #[[ATTR4]]
+; IS__CGSCC____-NEXT:    call void @use(i32 noundef [[TMP0]], i32 noundef [[TMP1]], i32 noundef 42) #[[ATTR4]]
 ; IS__CGSCC____-NEXT:    ret void
 ;
 entry:
@@ -236,7 +235,7 @@
 ; IS__CGSCC_OPM-NEXT:    call void @level2b(i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) undef) #[[ATTR4]]
 ; IS__CGSCC_OPM-NEXT:    br label [[IF_END]]
 ; IS__CGSCC_OPM:       if.end:
-; IS__CGSCC_OPM-NEXT:    call void @level2all_late(i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) undef) #[[ATTR7:[0-9]+]]
+; IS__CGSCC_OPM-NEXT:    call void @level2all_late(i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) undef) #[[ATTR6:[0-9]+]]
 ; IS__CGSCC_OPM-NEXT:    ret void
 ;
 ; IS__CGSCC_NPM: Function Attrs: norecurse nosync nounwind
@@ -251,10 +250,10 @@
 ; IS__CGSCC_NPM-NEXT:    call void @level2a(i32 undef) #[[ATTR4]]
 ; IS__CGSCC_NPM-NEXT:    br label [[IF_END:%.*]]
 ; IS__CGSCC_NPM:       if.else:
-; IS__CGSCC_NPM-NEXT:    call void @level2b(i32 undef) #[[ATTR7:[0-9]+]]
+; IS__CGSCC_NPM-NEXT:    call void @level2b(i32 undef) #[[ATTR6:[0-9]+]]
 ; IS__CGSCC_NPM-NEXT:    br label [[IF_END]]
 ; IS__CGSCC_NPM:       if.end:
-; IS__CGSCC_NPM-NEXT:    call void @level2all_late(i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) undef) #[[ATTR8:[0-9]+]]
+; IS__CGSCC_NPM-NEXT:    call void @level2all_late(i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) undef) #[[ATTR7:[0-9]+]]
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 entry:
@@ -304,17 +303,17 @@
 ; IS__TUNIT____-NEXT:  entry:
 ; IS__TUNIT____-NEXT:    [[TMP0:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableNonKernel to i32*), align 4
 ; IS__TUNIT____-NEXT:    [[TMP1:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @UnreachableNonKernel to i32*), align 4
-; IS__TUNIT____-NEXT:    call void @use(i32 [[TMP0]], i32 [[TMP1]], i32 17) #[[ATTR6]]
+; IS__TUNIT____-NEXT:    call void @use(i32 noundef [[TMP0]], i32 noundef [[TMP1]], i32 17) #[[ATTR6]]
 ; IS__TUNIT____-NEXT:    ret void
 ;
 ; IS__CGSCC_OPM: Function Attrs: norecurse nosync nounwind
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@level2a
-; IS__CGSCC_OPM-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[ADDR:%.*]]) #[[ATTR1]] {
+; IS__CGSCC_OPM-SAME: (i32* noalias nocapture nofree nonnull readnone align 4294967296 dereferenceable(4) [[ADDR:%.*]]) #[[ATTR1]] {
 ; IS__CGSCC_OPM-NEXT:  entry:
 ; IS__CGSCC_OPM-NEXT:    [[TMP0:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableNonKernel to i32*), align 4
 ; IS__CGSCC_OPM-NEXT:    [[TMP1:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @UnreachableNonKernel to i32*), align 4
-; IS__CGSCC_OPM-NEXT:    [[TMP2:%.*]] = load i32, i32* undef, align 4
-; IS__CGSCC_OPM-NEXT:    call void @use(i32 [[TMP0]], i32 [[TMP1]], i32 17) #[[ATTR4]]
+; IS__CGSCC_OPM-NEXT:    [[TMP2:%.*]] = load i32, i32* undef, align 4294967296
+; IS__CGSCC_OPM-NEXT:    call void @use(i32 noundef [[TMP0]], i32 noundef [[TMP1]], i32 17) #[[ATTR4]]
 ; IS__CGSCC_OPM-NEXT:    ret void
 ;
 ; IS__CGSCC_NPM: Function Attrs: norecurse nosync nounwind
@@ -325,7 +324,7 @@
 ; IS__CGSCC_NPM-NEXT:    [[TMP1:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableNonKernel to i32*), align 4
 ; IS__CGSCC_NPM-NEXT:    [[TMP2:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @UnreachableNonKernel to i32*), align 4
 ; IS__CGSCC_NPM-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ADDR_PRIV]], align 4
-; IS__CGSCC_NPM-NEXT:    call void @use(i32 [[TMP1]], i32 [[TMP2]], i32 17) #[[ATTR4]]
+; IS__CGSCC_NPM-NEXT:    call void @use(i32 noundef [[TMP1]], i32 noundef [[TMP2]], i32 17) #[[ATTR4]]
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 entry:
@@ -343,17 +342,17 @@
 ; IS__TUNIT____-NEXT:  entry:
 ; IS__TUNIT____-NEXT:    [[TMP0:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableNonKernel to i32*), align 4
 ; IS__TUNIT____-NEXT:    [[TMP1:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @UnreachableNonKernel to i32*), align 4
-; IS__TUNIT____-NEXT:    call void @use(i32 [[TMP0]], i32 [[TMP1]], i32 17) #[[ATTR6]]
+; IS__TUNIT____-NEXT:    call void @use(i32 noundef [[TMP0]], i32 noundef [[TMP1]], i32 17) #[[ATTR6]]
 ; IS__TUNIT____-NEXT:    ret void
 ;
 ; IS__CGSCC_OPM: Function Attrs: norecurse nosync nounwind
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@level2b
-; IS__CGSCC_OPM-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[ADDR:%.*]]) #[[ATTR1]] {
+; IS__CGSCC_OPM-SAME: (i32* noalias nocapture nofree nonnull readnone align 4294967296 dereferenceable(4) [[ADDR:%.*]]) #[[ATTR1]] {
 ; IS__CGSCC_OPM-NEXT:  entry:
 ; IS__CGSCC_OPM-NEXT:    [[TMP0:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableNonKernel to i32*), align 4
 ; IS__CGSCC_OPM-NEXT:    [[TMP1:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @UnreachableNonKernel to i32*), align 4
-; IS__CGSCC_OPM-NEXT:    [[TMP2:%.*]] = load i32, i32* undef, align 4
-; IS__CGSCC_OPM-NEXT:    call void @use(i32 [[TMP0]], i32 [[TMP1]], i32 17) #[[ATTR4]]
+; IS__CGSCC_OPM-NEXT:    [[TMP2:%.*]] = load i32, i32* undef, align 4294967296
+; IS__CGSCC_OPM-NEXT:    call void @use(i32 noundef [[TMP0]], i32 noundef [[TMP1]], i32 17) #[[ATTR4]]
 ; IS__CGSCC_OPM-NEXT:    ret void
 ;
 ; IS__CGSCC_NPM: Function Attrs: norecurse nosync nounwind
@@ -364,7 +363,7 @@
 ; IS__CGSCC_NPM-NEXT:    [[TMP1:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @ReachableNonKernel to i32*), align 4
 ; IS__CGSCC_NPM-NEXT:    [[TMP2:%.*]] = load i32, i32* addrspacecast (i32 addrspace(3)* @UnreachableNonKernel to i32*), align 4
 ; IS__CGSCC_NPM-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ADDR_PRIV]], align 4
-; IS__CGSCC_NPM-NEXT:    call void @use(i32 [[TMP1]], i32 [[TMP2]], i32 17) #[[ATTR4]]
+; IS__CGSCC_NPM-NEXT:    call void @use(i32 noundef [[TMP1]], i32 noundef [[TMP2]], i32 17) #[[ATTR4]]
 ; IS__CGSCC_NPM-NEXT:    ret void
 ;
 entry:
@@ -413,8 +412,7 @@
 ; IS__CGSCC_OPM: attributes #[[ATTR3]] = { nofree norecurse nosync nounwind readnone willreturn }
 ; IS__CGSCC_OPM: attributes #[[ATTR4]] = { nounwind }
 ; IS__CGSCC_OPM: attributes #[[ATTR5]] = { nounwind willreturn writeonly }
-; IS__CGSCC_OPM: attributes #[[ATTR6]] = { nounwind readnone }
-; IS__CGSCC_OPM: attributes #[[ATTR7]] = { nounwind writeonly }
+; IS__CGSCC_OPM: attributes #[[ATTR6]] = { nounwind writeonly }
 ;.
 ; IS__CGSCC_NPM: attributes #[[ATTR0]] = { norecurse nosync nounwind "kernel" }
 ; IS__CGSCC_NPM: attributes #[[ATTR1]] = { norecurse nosync nounwind }
@@ -422,7 +420,6 @@
 ; IS__CGSCC_NPM: attributes #[[ATTR3]] = { nofree norecurse nosync nounwind readnone willreturn }
 ; IS__CGSCC_NPM: attributes #[[ATTR4]] = { nounwind }
 ; IS__CGSCC_NPM: attributes #[[ATTR5]] = { nounwind willreturn writeonly }
-; IS__CGSCC_NPM: attributes #[[ATTR6]] = { nounwind readnone }
-; IS__CGSCC_NPM: attributes #[[ATTR7]] = { nosync nounwind }
-; IS__CGSCC_NPM: attributes #[[ATTR8]] = { nosync nounwind writeonly }
+; IS__CGSCC_NPM: attributes #[[ATTR6]] = { nosync nounwind }
+; IS__CGSCC_NPM: attributes #[[ATTR7]] = { nosync nounwind writeonly }
 ;.
diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
--- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
@@ -3183,8 +3183,7 @@
 ; IS__TUNIT_OPM-NEXT:    [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32*
 ; IS__TUNIT_OPM-NEXT:    store i32 [[X]], i32* [[TMP0]], align 4
 ; IS__TUNIT_OPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
-; IS__TUNIT_OPM-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8*
-; IS__TUNIT_OPM-NEXT:    call void @free(i8* noundef [[TMP2]]) #[[ATTR17]]
+; IS__TUNIT_OPM-NEXT:    call void @free(i8* noundef [[CALL]]) #[[ATTR17]]
 ; IS__TUNIT_OPM-NEXT:    ret i32 [[TMP1]]
 ;
 ; IS________NPM-LABEL: define {{[^@]+}}@round_trip_malloc
@@ -3203,8 +3202,7 @@
 ; IS__CGSCC_OPM-NEXT:    [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32*
 ; IS__CGSCC_OPM-NEXT:    store i32 [[X]], i32* [[TMP0]], align 4
 ; IS__CGSCC_OPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
-; IS__CGSCC_OPM-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8*
-; IS__CGSCC_OPM-NEXT:    call void @free(i8* noundef [[TMP2]]) #[[ATTR16]]
+; IS__CGSCC_OPM-NEXT:    call void @free(i8* noundef [[CALL]]) #[[ATTR16]]
 ; IS__CGSCC_OPM-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
@@ -3224,8 +3222,7 @@
 ; IS__TUNIT_OPM-NEXT:    [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32*
 ; IS__TUNIT_OPM-NEXT:    store i32 7, i32* [[TMP0]], align 4
 ; IS__TUNIT_OPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
-; IS__TUNIT_OPM-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8*
-; IS__TUNIT_OPM-NEXT:    call void @free(i8* noundef [[TMP2]]) #[[ATTR17]]
+; IS__TUNIT_OPM-NEXT:    call void @free(i8* noundef [[CALL]]) #[[ATTR17]]
 ; IS__TUNIT_OPM-NEXT:    ret i32 [[TMP1]]
 ;
 ; IS________NPM-LABEL: define {{[^@]+}}@round_trip_malloc_constant() {
@@ -3238,8 +3235,7 @@
 ; IS__CGSCC_OPM-NEXT:    [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32*
 ; IS__CGSCC_OPM-NEXT:    store i32 7, i32* [[TMP0]], align 4
 ; IS__CGSCC_OPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
-; IS__CGSCC_OPM-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8*
-; IS__CGSCC_OPM-NEXT:    call void @free(i8* noundef [[TMP2]]) #[[ATTR16]]
+; IS__CGSCC_OPM-NEXT:    call void @free(i8* noundef [[CALL]]) #[[ATTR16]]
 ; IS__CGSCC_OPM-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
@@ -3417,6 +3413,7 @@
 ; IS________NPM-NEXT:    br label [[IF_END]]
 ; IS________NPM:       if.end:
 ; IS________NPM-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+; IS________NPM-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP1]] to i8*
 ; IS________NPM-NEXT:    ret i32 [[TMP2]]
 ;
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@conditional_calloc
@@ -3473,10 +3470,12 @@
 ; IS________NPM-NEXT:  entry:
 ; IS________NPM-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 1
 ; IS________NPM-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[TMP0]], i8 0, i64 4, i1 false)
+; IS________NPM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
 ; IS________NPM-NEXT:    br i1 [[C]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; IS________NPM:       if.then:
 ; IS________NPM-NEXT:    br label [[IF_END]]
 ; IS________NPM:       if.end:
+; IS________NPM-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
 ; IS________NPM-NEXT:    ret i32 0
 ;
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@conditional_calloc_zero
diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll
--- a/llvm/test/Transforms/Attributor/value-simplify.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify.ll
@@ -903,7 +903,9 @@
 ;
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@test_callee_is_undef
 ; IS__CGSCC____-SAME: (void (i32)* nocapture nofree [[FN:%.*]]) {
-; IS__CGSCC____-NEXT:    unreachable
+; IS__CGSCC____-NEXT:    call void @callee_is_undef()
+; IS__CGSCC____-NEXT:    call void @unknown_calle_arg_is_undef(void (i32)* nocapture nofree noundef nonnull [[FN]])
+; IS__CGSCC____-NEXT:    ret void
 ;
   call void @callee_is_undef(void ()* undef)
   call void @unknown_calle_arg_is_undef(void (i32)* %fn, i32 undef)
@@ -911,14 +913,9 @@
 }
 define internal void @callee_is_undef(void ()* %fn) {
 ;
-; IS__TUNIT____-LABEL: define {{[^@]+}}@callee_is_undef() {
-; IS__TUNIT____-NEXT:    call void undef()
-; IS__TUNIT____-NEXT:    ret void
-;
-; IS__CGSCC____-LABEL: define {{[^@]+}}@callee_is_undef
-; IS__CGSCC____-SAME: (void ()* nocapture nofree noundef nonnull [[FN:%.*]]) {
-; IS__CGSCC____-NEXT:    call void [[FN]]()
-; IS__CGSCC____-NEXT:    ret void
+; CHECK-LABEL: define {{[^@]+}}@callee_is_undef() {
+; CHECK-NEXT:    call void undef()
+; CHECK-NEXT:    ret void
 ;
   call void %fn()
   ret void
diff --git a/llvm/test/Transforms/ConstraintElimination/add-nuw.ll b/llvm/test/Transforms/ConstraintElimination/add-nuw.ll
--- a/llvm/test/Transforms/ConstraintElimination/add-nuw.ll
+++ b/llvm/test/Transforms/ConstraintElimination/add-nuw.ll
@@ -454,3 +454,127 @@
 }
 
 declare void @use(i1)
+
+define i1 @add_nuw_neg_pr54224_i16(i16 %a) {
+; CHECK-LABEL: @add_nuw_neg_pr54224_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NEG2:%.*]] = add nuw i16 [[A:%.*]], -305
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i16 0, [[NEG2]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ugt i16 [[A]], 0
+; CHECK-NEXT:    ret i1 false
+; CHECK:       exit.2:
+; CHECK-NEXT:    [[C_3:%.*]] = icmp ugt i16 [[A]], 0
+; CHECK-NEXT:    ret i1 [[C_3]]
+;
+entry:
+  %neg2 = add nuw i16 %a, -305
+  %c.1 = icmp ugt i16 0, %neg2
+  br i1 %c.1, label %exit.1, label %exit.2
+
+exit.1:
+  %c.2 = icmp ugt i16 %a, 0
+  ret i1 %c.2
+
+exit.2:
+  %c.3 = icmp ugt i16 %a, 0
+  ret i1 %c.3
+}
+
+define i1 @add_nuw_neg_pr54224_i64(i64 %a) {
+; CHECK-LABEL: @add_nuw_neg_pr54224_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NEG2:%.*]] = add nuw i64 [[A:%.*]], -305
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i64 0, [[NEG2]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ugt i64 [[A]], 0
+; CHECK-NEXT:    ret i1 [[C_2]]
+; CHECK:       exit.2:
+; CHECK-NEXT:    [[C_3:%.*]] = icmp ugt i64 [[A]], 0
+; CHECK-NEXT:    ret i1 [[C_3]]
+;
+entry:
+  %neg2 = add nuw i64 %a, -305
+  %c.1 = icmp ugt i64 0, %neg2
+  br i1 %c.1, label %exit.1, label %exit.2
+
+exit.1:
+  %c.2 = icmp ugt i64 %a, 0
+  ret i1 %c.2
+
+exit.2:
+  %c.3 = icmp ugt i64 %a, 0
+  ret i1 %c.3
+}
+
+define i1 @add_nuw_neg2_i8(i8 %a) {
+; CHECK-LABEL: @add_nuw_neg2_i8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NEG2:%.*]] = add nuw i8 [[A:%.*]], -4
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ult i8 [[NEG2]], -2
+; CHECK-NEXT:    br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    [[T_1:%.*]] = icmp ult i8 [[A]], 2
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i8 [[A]], 1
+; CHECK-NEXT:    [[RES_1:%.*]] = xor i1 true, [[C_2]]
+; CHECK-NEXT:    ret i1 [[RES_1]]
+; CHECK:       exit.2:
+; CHECK-NEXT:    [[C_3:%.*]] = icmp ult i8 [[A]], 3
+; CHECK-NEXT:    [[F_1:%.*]] = icmp ult i8 [[A]], 2
+; CHECK-NEXT:    [[RES_2:%.*]] = xor i1 [[C_3]], false
+; CHECK-NEXT:    ret i1 [[RES_2]]
+;
+entry:
+  %neg2 = add nuw i8 %a, -4
+  %c.1 = icmp ult i8 %neg2, -2
+  br i1 %c.1, label %exit.1, label %exit.2
+
+exit.1:
+  %t.1 = icmp ult i8 %a, 2
+  %c.2 = icmp ult i8 %a, 1
+  %res.1 = xor i1 %t.1, %c.2
+  ret i1 %res.1
+
+exit.2:
+  %c.3 = icmp ult i8 %a, 3
+  %f.1 = icmp ult i8 %a, 2
+  %res.2 = xor i1 %c.3, %f.1
+  ret i1 %res.2
+}
+
+define i1 @add_nuw_neg2_i64(i64 %a) {
+; CHECK-LABEL: @add_nuw_neg2_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NEG2:%.*]] = add nuw i64 [[A:%.*]], -4
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ult i64 [[NEG2]], -2
+; CHECK-NEXT:    br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    [[T_1:%.*]] = icmp ult i64 [[A]], 2
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i64 [[A]], 1
+; CHECK-NEXT:    [[RES_1:%.*]] = xor i1 [[T_1]], [[C_2]]
+; CHECK-NEXT:    ret i1 [[RES_1]]
+; CHECK:       exit.2:
+; CHECK-NEXT:    [[C_3:%.*]] = icmp ult i64 [[A]], 3
+; CHECK-NEXT:    [[F_1:%.*]] = icmp ult i64 [[A]], 2
+; CHECK-NEXT:    [[RES_2:%.*]] = xor i1 [[C_3]], [[F_1]]
+; CHECK-NEXT:    ret i1 [[RES_2]]
+;
+entry:
+  %neg2 = add nuw i64 %a, -4
+  %c.1 = icmp ult i64 %neg2, -2
+  br i1 %c.1, label %exit.1, label %exit.2
+
+exit.1:
+  %t.1 = icmp ult i64 %a, 2
+  %c.2 = icmp ult i64 %a, 1
+  %res.1 = xor i1 %t.1, %c.2
+  ret i1 %res.1
+
+exit.2:
+  %c.3 = icmp ult i64 %a, 3
+  %f.1 = icmp ult i64 %a, 2
+  %res.2 = xor i1 %c.3, %f.1
+  ret i1 %res.2
+}
diff --git a/llvm/test/Transforms/ConstraintElimination/assumes.ll b/llvm/test/Transforms/ConstraintElimination/assumes.ll
--- a/llvm/test/Transforms/ConstraintElimination/assumes.ll
+++ b/llvm/test/Transforms/ConstraintElimination/assumes.ll
@@ -152,6 +152,34 @@
   ret i1 %res.4
 }
 
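+; Test case from PR54217: the assume in %if.then (after a call that may unwind) does not dominate %exit, which %entry can reach directly, so %c.2 must not be simplified.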
+define i1 @assume_does_not_dominates_successor_with_may_unwind_call_before_assume(i16 %a, i1 %i.0) {
+; CHECK-LABEL: @assume_does_not_dominates_successor_with_may_unwind_call_before_assume(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[I_0:%.*]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @may_unwind()
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i16 [[A:%.*]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C_1]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq i16 [[A]], 0
+; CHECK-NEXT:    ret i1 [[C_2]]
+;
+entry:
+  br i1 %i.0, label %exit, label %if.then
+
+if.then:
+  call void @may_unwind()
+  %c.1 = icmp eq i16 %a, 0
+  call void @llvm.assume(i1 %c.1)
+  br label %exit
+
+exit:
+  %c.2 = icmp eq i16 %a, 0
+  ret i1 %c.2
+}
+
 define i1 @assume_single_bb(i8 %a, i8 %b, i1 %c) {
 ; CHECK-LABEL: @assume_single_bb(
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i8 [[A:%.*]], 1
@@ -489,3 +517,4 @@
   %res.2 = xor i1 %res.1, %c.2
   ret i1 %res.2
 }
+
diff --git a/llvm/test/Transforms/ConstraintElimination/pr54228-variable-name-order.ll b/llvm/test/Transforms/ConstraintElimination/pr54228-variable-name-order.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/pr54228-variable-name-order.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=constraint-elimination -S %s | FileCheck %s
+
+
+declare void @fn()
+
+define i1 @test_pr54228(i32 %a, i32 %b, i1 %i.0, i1 %i.1) {
+; CHECK-LABEL: @test_pr54228(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[I_0:%.*]], label [[PH_1:%.*]], label [[LOOP_HEADER:%.*]]
+; CHECK:       ph.1:
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C_1]])
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    br i1 [[I_1:%.*]], label [[LOOP_THEN:%.*]], label [[LOOP_LATCH:%.*]]
+; CHECK:       loop.then:
+; CHECK-NEXT:    call void @fn()
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C_2]])
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[C_3:%.*]] = icmp eq i32 [[B]], 1
+; CHECK-NEXT:    br i1 [[C_3]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[C_4:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT:    ret i1 [[C_4]]
+;
+entry:
+  br i1 %i.0, label %ph.1, label %loop.header
+
+ph.1:                                             ; preds = %entry
+  %c.1 = icmp eq i32 %a, 0
+  call void @llvm.assume(i1 %c.1)
+  br label %loop.header
+
+loop.header:                                      ; preds = %loop.latch, %ph.1, %entry
+  br i1 %i.1, label %loop.then, label %loop.latch
+
+loop.then:                                        ; preds = %loop.header
+  call void @fn()
+  %c.2 = icmp eq i32 %b, 0
+  call void @llvm.assume(i1 %c.2)
+  br label %loop.latch
+
+loop.latch:                                       ; preds = %loop.then, %loop.header
+  %c.3 = icmp eq i32 %b, 1
+  br i1 %c.3, label %exit, label %loop.header
+
+exit:                                             ; preds = %loop.latch
+  %c.4 = icmp eq i32 %a, 0
+  ret i1 %c.4
+}
+
+declare void @llvm.assume(i1 noundef)
diff --git a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll
--- a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll
+++ b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll
@@ -272,3 +272,84 @@
 }
 
 declare void @use(i1)
+
+define i1 @sub_nuw_i16_simp(i16 %a) {
+; CHECK-LABEL: @sub_nuw_i16_simp(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NEG2:%.*]] = sub nuw i16 [[A:%.*]], 305
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i16 0, [[NEG2]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ugt i16 [[A]], 0
+; CHECK-NEXT:    ret i1 [[C_2]]
+; CHECK:       exit.2:
+; CHECK-NEXT:    [[C_3:%.*]] = icmp ugt i16 [[A]], 0
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %neg2 = sub nuw i16 %a, 305
+  %c.1 = icmp ugt i16 0, %neg2
+  br i1 %c.1, label %exit.1, label %exit.2
+
+exit.1:
+  %c.2 = icmp ugt i16 %a, 0
+  ret i1 %c.2
+
+exit.2:
+  %c.3 = icmp ugt i16 %a, 0
+  ret i1 %c.3
+}
+
+define i1 @sub_nuw_i64_simp(i64 %a) {
+; CHECK-LABEL: @sub_nuw_i64_simp(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NEG2:%.*]] = sub nuw i64 [[A:%.*]], 305
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i64 0, [[NEG2]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ugt i64 [[A]], 0
+; CHECK-NEXT:    ret i1 [[C_2]]
+; CHECK:       exit.2:
+; CHECK-NEXT:    [[C_3:%.*]] = icmp ugt i64 [[A]], 0
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %neg2 = sub nuw i64 %a, 305
+  %c.1 = icmp ugt i64 0, %neg2
+  br i1 %c.1, label %exit.1, label %exit.2
+
+exit.1:
+  %c.2 = icmp ugt i64 %a, 0
+  ret i1 %c.2
+
+exit.2:
+  %c.3 = icmp ugt i64 %a, 0
+  ret i1 %c.3
+}
+
+define i1 @sub_nuw_neg_i16(i16 %a) {
+; CHECK-LABEL: @sub_nuw_neg_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NEG2:%.*]] = sub nuw i16 [[A:%.*]], -305
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i16 0, [[NEG2]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ugt i16 [[A]], 0
+; CHECK-NEXT:    ret i1 false
+; CHECK:       exit.2:
+; CHECK-NEXT:    [[C_3:%.*]] = icmp ugt i16 [[A]], 0
+; CHECK-NEXT:    ret i1 [[C_3]]
+;
+entry:
+  %neg2 = sub nuw i16 %a, -305
+  %c.1 = icmp ugt i16 0, %neg2
+  br i1 %c.1, label %exit.1, label %exit.2
+
+exit.1:
+  %c.2 = icmp ugt i16 %a, 0
+  ret i1 %c.2
+
+exit.2:
+  %c.3 = icmp ugt i16 %a, 0
+  ret i1 %c.3
+}
diff --git a/llvm/test/Transforms/Coroutines/coro-elide-musttail.ll b/llvm/test/Transforms/Coroutines/coro-elide-musttail.ll
--- a/llvm/test/Transforms/Coroutines/coro-elide-musttail.ll
+++ b/llvm/test/Transforms/Coroutines/coro-elide-musttail.ll
@@ -13,7 +13,7 @@
 @"bar.resumers" = private constant [3 x void (%"bar.Frame"*)*] [void (%"bar.Frame"*)* @"bar.resume", void (%"bar.Frame"*)* undef, void (%"bar.Frame"*)* undef]
 
 declare dso_local void @"bar"() align 2
-declare dso_local fastcc void @"bar.resume"(%"bar.Frame"*) align 2
+declare dso_local fastcc void @"bar.resume"(%"bar.Frame"* align 8 dereferenceable(24)) align 2
 
 ; There is a musttail call.
 ; With alias analysis, we can tell that the frame does not interfere with CALL34, and hence we can keep the tailcalls.
diff --git a/llvm/test/Transforms/Coroutines/coro-elide-stat.ll b/llvm/test/Transforms/Coroutines/coro-elide-stat.ll
--- a/llvm/test/Transforms/Coroutines/coro-elide-stat.ll
+++ b/llvm/test/Transforms/Coroutines/coro-elide-stat.ll
@@ -17,7 +17,7 @@
 declare void @print(i32) nounwind
 
 ; resume part of the coroutine
-define fastcc void @f.resume(i8*) {
+define fastcc void @f.resume(i8* dereferenceable(1)) {
   tail call void @print(i32 0)
   ret void
 }
diff --git a/llvm/test/Transforms/Coroutines/coro-elide.ll b/llvm/test/Transforms/Coroutines/coro-elide.ll
--- a/llvm/test/Transforms/Coroutines/coro-elide.ll
+++ b/llvm/test/Transforms/Coroutines/coro-elide.ll
@@ -7,7 +7,7 @@
 declare void @print(i32) nounwind
 
 ; resume part of the coroutine
-define fastcc void @f.resume(i8*) {
+define fastcc void @f.resume(i8* dereferenceable(1)) {
   tail call void @print(i32 0)
   ret void
 }
diff --git a/llvm/test/Transforms/Coroutines/coro-heap-elide.ll b/llvm/test/Transforms/Coroutines/coro-heap-elide.ll
--- a/llvm/test/Transforms/Coroutines/coro-heap-elide.ll
+++ b/llvm/test/Transforms/Coroutines/coro-heap-elide.ll
@@ -11,7 +11,7 @@
 
 declare void @bar(i8*)
 
-declare fastcc void @f.resume(%f.frame*)
+declare fastcc void @f.resume(%f.frame* align 4 dereferenceable(4))
 declare fastcc void @f.destroy(%f.frame*)
 declare fastcc void @f.cleanup(%f.frame*)
 
diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-opaque-ptr.ll b/llvm/test/Transforms/Coroutines/coro-retcon-opaque-ptr.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-retcon-opaque-ptr.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -enable-coroutines -passes='default<O2>' -opaque-pointers -S | FileCheck %s
+
+; Same test as coro-retcon.ll, but with opaque pointers enabled.
+
+define ptr @f(ptr %buffer, i32 %n) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  coro.return:
+; CHECK-NEXT:    store i32 [[N:%.*]], ptr [[BUFFER:%.*]], align 4
+; CHECK-NEXT:    tail call void @print(i32 [[N]])
+; CHECK-NEXT:    ret ptr @f.resume.0
+;
+entry:
+  %id = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr %buffer, ptr @prototype, ptr @allocate, ptr @deallocate)
+  %hdl = call ptr @llvm.coro.begin(token %id, ptr null)
+  br label %loop
+
+loop:                                             ; preds = %resume, %entry
+  %n.val = phi i32 [ %n, %entry ], [ %inc, %resume ]
+  call void @print(i32 %n.val)
+  %unwind0 = call i1 (...) @llvm.coro.suspend.retcon.i1()
+  br i1 %unwind0, label %cleanup, label %resume
+
+resume:                                           ; preds = %loop
+  %inc = add i32 %n.val, 1
+  br label %loop
+
+cleanup:                                          ; preds = %loop
+  %0 = call i1 @llvm.coro.end(ptr %hdl, i1 false)
+  unreachable
+}
+
+define i32 @main() {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca [8 x i8], align 4
+; CHECK-NEXT:    store i32 4, ptr [[TMP0]], align 4
+; CHECK-NEXT:    call void @print(i32 4)
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
+; CHECK-NEXT:    [[N_VAL_RELOAD_I:%.*]] = load i32, ptr [[TMP0]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[INC_I:%.*]] = add i32 [[N_VAL_RELOAD_I]], 1
+; CHECK-NEXT:    store i32 [[INC_I]], ptr [[TMP0]], align 4, !alias.scope !0
+; CHECK-NEXT:    call void @print(i32 [[INC_I]]), !noalias !0
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
+; CHECK-NEXT:    [[N_VAL_RELOAD_I1:%.*]] = load i32, ptr [[TMP0]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[INC_I2:%.*]] = add i32 [[N_VAL_RELOAD_I1]], 1
+; CHECK-NEXT:    call void @print(i32 [[INC_I2]]), !noalias !3
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %0 = alloca [8 x i8], align 4
+  %prepare = call ptr @llvm.coro.prepare.retcon(ptr @f)
+  %cont0 = call ptr %prepare(ptr %0, i32 4)
+  %cont1 = call ptr %cont0(ptr %0, i1 zeroext false)
+  %cont2 = call ptr %cont1(ptr %0, i1 zeroext false)
+  %1 = call ptr %cont2(ptr %0, i1 zeroext true)
+  ret i32 0
+}
+
+define hidden { ptr, ptr } @g(ptr %buffer, ptr %ptr) {
+; CHECK-LABEL: @g(
+; CHECK-NEXT:  coro.return:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call ptr @allocate(i32 8)
+; CHECK-NEXT:    store ptr [[TMP0]], ptr [[BUFFER:%.*]], align 8
+; CHECK-NEXT:    store ptr [[PTR:%.*]], ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr, ptr } { ptr @g.resume.0, ptr undef }, ptr [[PTR]], 1
+; CHECK-NEXT:    ret { ptr, ptr } [[TMP1]]
+;
+entry:
+  %id = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr %buffer, ptr @g_prototype, ptr @allocate, ptr @deallocate)
+  %hdl = call ptr @llvm.coro.begin(token %id, ptr null)
+  br label %loop
+
+loop:                                             ; preds = %resume, %entry
+  %unwind0 = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr %ptr)
+  br i1 %unwind0, label %cleanup, label %resume
+
+resume:                                           ; preds = %loop
+  br label %loop
+
+cleanup:                                          ; preds = %loop
+  %0 = call i1 @llvm.coro.end(ptr %hdl, i1 false)
+  unreachable
+}
+
+declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr)
+declare ptr @llvm.coro.begin(token, ptr)
+declare i1 @llvm.coro.suspend.retcon.i1(...)
+declare i1 @llvm.coro.end(ptr, i1)
+declare ptr @llvm.coro.prepare.retcon(ptr)
+
+declare ptr @prototype(ptr, i1 zeroext) ; passed to llvm.coro.id.retcon in @f
+declare {ptr,ptr} @g_prototype(ptr, i1 zeroext) ; passed to llvm.coro.id.retcon in @g
+
+declare noalias ptr @allocate(i32 %size) ; allocator argument of llvm.coro.id.retcon in @f and @g
+declare void @deallocate(ptr %ptr) ; deallocator argument of llvm.coro.id.retcon in @f and @g
+
+declare void @print(i32)
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/deterministic-scev-verify.ll b/llvm/test/Transforms/IndVarSimplify/X86/deterministic-scev-verify.ll
--- a/llvm/test/Transforms/IndVarSimplify/X86/deterministic-scev-verify.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/deterministic-scev-verify.ll
@@ -1,18 +1,13 @@
-; RUN: opt -indvars -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS
-; RUN: opt -indvars -S < %s | FileCheck %s --check-prefix=IR
-; REQUIRES: asserts
+; RUN: opt -indvars -S < %s | FileCheck %s
 
 ; Check that IndVarSimplify's result is not influenced by stray calls to
 ; ScalarEvolution in debug builds. However, -verify-indvars may still do
 ; such calls.
 ; llvm.org/PR44815
 
-; STATS: 1 scalar-evolution - Number of loops with trip counts computed by force
-; STATS: 2 scalar-evolution - Number of loops with predictable loop counts
-
 ; In this test, adding -verify-indvars causes %tmp13 to not be optimized away.
-; IR-LABEL: @foo
-; IR-NOT:   phi i32
+; CHECK-LABEL: @foo
+; CHECK-NOT:   phi i32
 
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll
--- a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes='loop(indvars),verify<scalar-evolution>' %s | FileCheck %s
+; RUN: opt -S -indvars -verify-scev %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Transforms/LICM/loop-sink-phi-in-preheader.ll b/llvm/test/Transforms/LICM/loop-sink-phi-in-preheader.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LICM/loop-sink-phi-in-preheader.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=loop-sink %s | FileCheck %s
+
+define void @preheader_of_inner_has_phi() !prof !0 {
+; CHECK-LABEL: @preheader_of_inner_has_phi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[PH_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[PH_2:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NEXT:    br label [[INNER:%.*]]
+; CHECK:       inner:
+; CHECK-NEXT:    [[PH_2]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ 0, [[INNER]] ]
+; CHECK-NEXT:    br i1 false, label [[OUTER_LATCH]], label [[INNER]], !prof [[PROF1:![0-9]+]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    br label [[OUTER_HEADER]]
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %ph.1 = phi i32 [ 0, %entry ], [ %ph.2, %outer.latch ]
+  br label %inner
+
+inner:
+  %ph.2 = phi i32 [ 0, %outer.header ], [ 0, %inner ]
+  br i1 false, label %outer.latch, label %inner, !prof !1
+
+outer.latch:
+  br label %outer.header
+}
+
+!0 = !{!"function_entry_count", i64 549102}
+!1 = !{!"branch_weights", i32 4027913, i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -12,12 +12,12 @@
 entry:
   br label %for.body
 
-; VF_8-LABEL:  Checking a loop in "i8_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16-LABEL: Checking a loop in 'i8_factor_2'
 ; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
@@ -43,17 +43,17 @@
 entry:
   br label %for.body
 
-; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4-LABEL: Checking a loop in 'i16_factor_2'
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL:  Checking a loop in "i16_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16-LABEL: Checking a loop in 'i16_factor_2'
 ; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
@@ -79,22 +79,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i32_factor_2"
+; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL:  Checking a loop in "i32_factor_2"
+; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_8-LABEL:  Checking a loop in "i32_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16-LABEL: Checking a loop in 'i32_factor_2'
 ; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
@@ -120,22 +120,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i64_factor_2"
+; VF_2-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_4-LABEL:  Checking a loop in "i64_factor_2"
+; VF_4-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_8-LABEL:  Checking a loop in "i64_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16-LABEL: Checking a loop in 'i64_factor_2'
 ; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
@@ -167,7 +167,7 @@
 ; stores do not form a legal interleaved group because the group would contain
 ; gaps.
 ;
-; VF_2-LABEL: Checking a loop in "i64_factor_8"
+; VF_2-LABEL: Checking a loop in 'i64_factor_8'
 ; VF_2:         Found an estimated cost of 10 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -4,7 +4,7 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; CHECK-COST: Checking a loop in "fixed_width"
+; CHECK-COST: Checking a loop in 'fixed_width'
 ; CHECK-COST: Found an estimated cost of 11 for VF 2 For instruction:   store i32 2, i32* %arrayidx1, align 4
 ; CHECK-COST: Found an estimated cost of 25 for VF 4 For instruction:   store i32 2, i32* %arrayidx1, align 4
 ; CHECK-COST: Selecting VF: 1.
@@ -45,7 +45,7 @@
 }
 
 
-; CHECK-COST: Checking a loop in "scalable"
+; CHECK-COST: Checking a loop in 'scalable'
 ; CHECK-COST: Found an estimated cost of 2 for VF vscale x 4 For instruction:   store i32 2, i32* %arrayidx1, align 4
 
 define void @scalable(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -7,7 +7,7 @@
 ; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16
 ; (maximized bandwidth for i8 in the loop).
 define void @test0(i32* %a, i8* %b, i32* %c) #0 {
-; CHECK: LV: Checking a loop in "test0"
+; CHECK: LV: Checking a loop in 'test0'
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
 ; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -38,7 +38,7 @@
 ; Test that the MaxVF for the following loop, with a dependence distance
 ; of 64 elements, is calculated as (maxvscale = 16) * 4.
 define void @test1(i32* %a, i8* %b) #0 {
-; CHECK: LV: Checking a loop in "test1"
+; CHECK: LV: Checking a loop in 'test1'
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
 ; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -70,7 +70,7 @@
 ; Test that the MaxVF for the following loop, with a dependence distance
 ; of 32 elements, is calculated as (maxvscale = 16) * 2.
 define void @test2(i32* %a, i8* %b) #0 {
-; CHECK: LV: Checking a loop in "test2"
+; CHECK: LV: Checking a loop in 'test2'
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
 ; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -102,7 +102,7 @@
 ; Test that the MaxVF for the following loop, with a dependence distance
 ; of 16 elements, is calculated as (maxvscale = 16) * 1.
 define void @test3(i32* %a, i8* %b) #0 {
-; CHECK: LV: Checking a loop in "test3"
+; CHECK: LV: Checking a loop in 'test3'
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
 ; CHECK_SCALABLE_ON: LV: Selecting VF: 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -134,7 +134,7 @@
 ; Test the fallback mechanism when scalable vectors are not feasible due
 ; to e.g. dependence distance.
 define void @test4(i32* %a, i32* %b) #0 {
-; CHECK: LV: Checking a loop in "test4"
+; CHECK: LV: Checking a loop in 'test4'
 ; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
 ; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
 ; CHECK_SCALABLE_ON: LV: Selecting VF: 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -37,7 +37,7 @@
 ; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check
 ; fixed-width vectorization is used instead.
 
-; CHECK-DBG: LV: Checking a loop in "test1"
+; CHECK-DBG: LV: Checking a loop in 'test1'
 ; CHECK-DBG: LV: Scalable vectorization is available
 ; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
 ; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible.
@@ -82,7 +82,7 @@
 ;   }
 ; }
 
-; CHECK-DBG: LV: Checking a loop in "test2"
+; CHECK-DBG: LV: Checking a loop in 'test2'
 ; CHECK-DBG: LV: Scalable vectorization is available
 ; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
 ; CHECK-DBG: LV: The max safe fixed VF is: 4.
@@ -132,7 +132,7 @@
 ;
 ; Max fixed VF=32, Max scalable VF=2, safe to vectorize.
 
-; CHECK-DBG-LABEL: LV: Checking a loop in "test3"
+; CHECK-DBG-LABEL: LV: Checking a loop in 'test3'
 ; CHECK-DBG: LV: Scalable vectorization is available
 ; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
 ; CHECK-DBG: LV: Using user VF vscale x 2.
@@ -181,7 +181,7 @@
 ;
 ; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize.
 
-; CHECK-DBG-LABEL: LV: Checking a loop in "test4"
+; CHECK-DBG-LABEL: LV: Checking a loop in 'test4'
 ; CHECK-DBG: LV: Scalable vectorization is available
 ; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
 ; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
@@ -232,7 +232,7 @@
 ;
 ; Max fixed VF=128, Max scalable VF=8, safe to vectorize.
 
-; CHECK-DBG-LABEL: LV: Checking a loop in "test5"
+; CHECK-DBG-LABEL: LV: Checking a loop in 'test5'
 ; CHECK-DBG: LV: Scalable vectorization is available
 ; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
 ; CHECK-DBG: LV: Using user VF vscale x 4
@@ -280,7 +280,7 @@
 ;
 ; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize.
 
-; CHECK-DBG-LABEL: LV: Checking a loop in "test6"
+; CHECK-DBG-LABEL: LV: Checking a loop in 'test6'
 ; CHECK-DBG: LV: Scalable vectorization is available
 ; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
 ; CHECK-DBG: LV: User VF=vscale x 16 is unsafe. Ignoring scalable UserVF.
@@ -315,7 +315,7 @@
 !16 = !{!"llvm.loop.vectorize.width", i32 16}
 !17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 
-; CHECK-NO-SVE-REMARKS-LABEL: LV: Checking a loop in "test_no_sve"
+; CHECK-NO-SVE-REMARKS-LABEL: LV: Checking a loop in 'test_no_sve'
 ; CHECK-NO-SVE-REMARKS: LV: User VF=vscale x 4 is ignored because scalable vectors are not available.
 ; CHECK-NO-SVE-REMARKS: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is ignored because the target does not support scalable vectors. The compiler will pick a more suitable value.
 ; CHECK-NO-SVE-REMARKS: LV: Selecting VF: 4.
@@ -349,7 +349,7 @@
 ; Test the LV falls back to fixed-width vectorization if scalable vectors are
 ; supported but max vscale is undefined.
 ;
-; CHECK-DBG-LABEL: LV: Checking a loop in "test_no_max_vscale"
+; CHECK-DBG-LABEL: LV: Checking a loop in 'test_no_max_vscale'
 ; CHECK-DBG: LV: Scalable vectorization is available
 ; CHECK-DBG: The max safe fixed VF is: 4.
 ; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
 
-; CHECK-LABEL: Checking a loop in "interleaved_access"
+; CHECK-LABEL: Checking a loop in 'interleaved_access'
 ; CHECK:         The Smallest and Widest types: 64 / 64 bits
 ;
 define void @interleaved_access(i8** %A, i64 %N) {
@@ -36,7 +36,7 @@
 ; determined by looking through the recurrences, which allows a sensible VF to be
 ; chosen. The following 3 cases check different combinations of widths.
 
-; CHECK-LABEL: Checking a loop in "no_loads_stores_32"
+; CHECK-LABEL: Checking a loop in 'no_loads_stores_32'
 ; CHECK: The Smallest and Widest types: 4294967295 / 32 bits
 ; CHECK: Selecting VF: 4
 
@@ -59,7 +59,7 @@
   ret double %.lcssa
 }
 
-; CHECK-LABEL: Checking a loop in "no_loads_stores_16"
+; CHECK-LABEL: Checking a loop in 'no_loads_stores_16'
 ; CHECK: The Smallest and Widest types: 4294967295 / 16 bits
 ; CHECK: Selecting VF: 8
 
@@ -81,7 +81,7 @@
   ret double %.lcssa
 }
 
-; CHECK-LABEL: Checking a loop in "no_loads_stores_8"
+; CHECK-LABEL: Checking a loop in 'no_loads_stores_8'
 ; CHECK: The Smallest and Widest types: 4294967295 / 8 bits
 ; CHECK: Selecting VF: 16
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -7,11 +7,11 @@
 
 target triple = "aarch64-linux-gnu"
 
-; DEBUG: LV: Checking a loop in "main_vf_vscale_x_16"
+; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_16'
 ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
 ; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1
 
-; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_16"
+; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_16'
 ; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
 ; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
 ; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
@@ -194,11 +194,11 @@
 }
 
 
-; DEBUG: LV: Checking a loop in "main_vf_vscale_x_2"
+; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_2'
 ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
 ; DEBUG: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
 
-; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_2"
+; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_2'
 ; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
 ; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
 ; DEBUG-FORCED: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll
@@ -3,7 +3,7 @@
 
 target triple="aarch64--linux-gnu"
 
-; CHECK: LV: Checking a loop in "gather_nxv4i32_loaded_index"
+; CHECK: LV: Checking a loop in 'gather_nxv4i32_loaded_index'
 ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction:   %1 = load float, float* %arrayidx3, align 4
 define void @gather_nxv4i32_loaded_index(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 {
 entry:
@@ -25,7 +25,7 @@
   ret void
 }
 
-; CHECK: LV: Checking a loop in "scatter_nxv4i32_loaded_index"
+; CHECK: LV: Checking a loop in 'scatter_nxv4i32_loaded_index'
 ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction:   store float %1, float* %arrayidx5, align 4
 define void @scatter_nxv4i32_loaded_index(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 {
 entry:
@@ -49,7 +49,7 @@
 
 ; NOTE: For runtime-determined strides the vectoriser versions the loop and adds SCEV checks
 ; to ensure the stride value is always 1. Therefore, it can assume a contiguous load and a cost of 1.
-; CHECK: LV: Checking a loop in "gather_nxv4i32_unknown_stride"
+; CHECK: LV: Checking a loop in 'gather_nxv4i32_unknown_stride'
 ; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %0 = load float, float* %arrayidx, align 4
 define void @gather_nxv4i32_unknown_stride(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %stride, i64 %n) #0 {
 entry:
@@ -72,7 +72,7 @@
 
 ; NOTE: For runtime-determined strides the vectoriser versions the loop and adds SCEV checks
 ; to ensure the stride value is always 1. Therefore, it can assume a contiguous load and cost is 1.
-; CHECK: LV: Checking a loop in "scatter_nxv4i32_unknown_stride"
+; CHECK: LV: Checking a loop in 'scatter_nxv4i32_unknown_stride'
 ; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   store float %0, float* %arrayidx2, align 4
 define void @scatter_nxv4i32_unknown_stride(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %stride, i64 %n) #0 {
 entry:
@@ -93,7 +93,7 @@
   ret void
 }
 
-; CHECK: LV: Checking a loop in "gather_nxv4i32_stride2"
+; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride2'
 ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction:   %0 = load float, float* %arrayidx, align 4
 define void @gather_nxv4i32_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
 entry:
@@ -114,7 +114,7 @@
   ret void
 }
 
-; CHECK: LV: Checking a loop in "scatter_nxv4i32_stride2"
+; CHECK: LV: Checking a loop in 'scatter_nxv4i32_stride2'
 ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction:   store float %0, float* %arrayidx2, align 4
 define void @scatter_nxv4i32_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
 entry:
@@ -136,7 +136,7 @@
 }
 
 
-; CHECK: LV: Checking a loop in "gather_nxv4i32_stride64"
+; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride64'
 ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction:   %0 = load float, float* %arrayidx, align 4
 define void @gather_nxv4i32_stride64(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
 entry:
@@ -157,7 +157,7 @@
   ret void
 }
 
-; CHECK: LV: Checking a loop in "scatter_nxv4i32_stride64"
+; CHECK: LV: Checking a loop in 'scatter_nxv4i32_stride64'
 ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction:   store float %0, float* %arrayidx2, align 4
 define void @scatter_nxv4i32_stride64(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -4,7 +4,7 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; CHECK-LABEL:  LV: Checking a loop in "pointer_induction_used_as_vector"
+; CHECK-LABEL:  LV: Checking a loop in 'pointer_induction_used_as_vector'
 ; CHECK-NOT:    LV: Found {{.*}} scalar instruction:   %ptr.iv.2.next = getelementptr inbounds i8, i8* %ptr.iv.2, i64 1
 ;
 ; CHECK:        VPlan 'Initial VPlan for VF={vscale x 2},UF>=1' {
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -14,7 +14,7 @@
 ; are allowed, even without -ffast-math.
 
 ; Integer loops are always vectorizeable
-; CHECK: Checking a loop in "sumi"
+; CHECK: Checking a loop in 'sumi'
 ; CHECK: We can vectorize this loop!
 define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
 entry:
@@ -45,11 +45,11 @@
 }
 
 ; Floating-point loops need fast-math to be vectorizeable
-; LINUX: Checking a loop in "sumf"
+; LINUX: Checking a loop in 'sumf'
 ; LINUX: Potentially unsafe FP op prevents vectorization
-; MVE: Checking a loop in "sumf"
+; MVE: Checking a loop in 'sumf'
 ; MVE: We can vectorize this loop!
-; DARWIN: Checking a loop in "sumf"
+; DARWIN: Checking a loop in 'sumf'
 ; DARWIN: We can vectorize this loop!
 define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
 entry:
@@ -80,7 +80,7 @@
 }
 
 ; Integer loops are always vectorizeable
-; CHECK: Checking a loop in "redi"
+; CHECK: Checking a loop in 'redi'
 ; CHECK: We can vectorize this loop!
 define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
 entry:
@@ -113,11 +113,11 @@
 }
 
 ; Floating-point loops need fast-math to be vectorizeable
-; LINUX: Checking a loop in "redf"
+; LINUX: Checking a loop in 'redf'
 ; LINUX: Potentially unsafe FP op prevents vectorization
-; MVE: Checking a loop in "redf"
+; MVE: Checking a loop in 'redf'
 ; MVE: We can vectorize this loop!
-; DARWIN: Checking a loop in "redf"
+; DARWIN: Checking a loop in 'redf'
 ; DARWIN: We can vectorize this loop!
 define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
 entry:
@@ -150,9 +150,9 @@
 }
 
 ; Make sure calls that turn into builtins are also covered
-; LINUX: Checking a loop in "fabs"
+; LINUX: Checking a loop in 'fabs'
 ; LINUX: Potentially unsafe FP op prevents vectorization
-; DARWIN: Checking a loop in "fabs"
+; DARWIN: Checking a loop in 'fabs'
 ; DARWIN: We can vectorize this loop!
 define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
 entry:
@@ -178,7 +178,7 @@
 }
 
 ; Integer loops are always vectorizeable
-; CHECK: Checking a loop in "sumi_fast"
+; CHECK: Checking a loop in 'sumi_fast'
 ; CHECK: We can vectorize this loop!
 define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
 entry:
@@ -209,7 +209,7 @@
 }
 
 ; Floating-point loops can be vectorizeable with fast-math
-; CHECK: Checking a loop in "sumf_fast"
+; CHECK: Checking a loop in 'sumf_fast'
 ; CHECK: We can vectorize this loop!
 define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
 entry:
@@ -240,7 +240,7 @@
 }
 
 ; Integer loops are always vectorizeable
-; CHECK: Checking a loop in "redi_fast"
+; CHECK: Checking a loop in 'redi_fast'
 ; CHECK: We can vectorize this loop!
 define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
 entry:
@@ -273,7 +273,7 @@
 }
 
 ; Floating-point loops can be vectorizeable with fast-math
-; CHECK: Checking a loop in "redf_fast"
+; CHECK: Checking a loop in 'redf_fast'
 ; CHECK: We can vectorize this loop!
 define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
 entry:
@@ -306,7 +306,7 @@
 }
 
 ; Make sure calls that turn into builtins are also covered
-; CHECK: Checking a loop in "fabs_fast"
+; CHECK: Checking a loop in 'fabs_fast'
 ; CHECK: We can vectorize this loop!
 define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -12,12 +12,12 @@
 entry:
   br label %for.body
 
-; VF_8-LABEL:  Checking a loop in "i8_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16-LABEL: Checking a loop in 'i8_factor_2'
 ; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
@@ -43,17 +43,17 @@
 entry:
   br label %for.body
 
-; VF_4-LABEL:  Checking a loop in "i16_factor_2"
+; VF_4-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL:  Checking a loop in "i16_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16-LABEL: Checking a loop in 'i16_factor_2'
 ; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
@@ -79,22 +79,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i32_factor_2"
+; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL:  Checking a loop in "i32_factor_2"
+; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_8-LABEL:  Checking a loop in "i32_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16-LABEL: Checking a loop in 'i32_factor_2'
 ; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
@@ -120,12 +120,12 @@
 entry:
   br label %for.body
 
-; VF_4-LABEL: Checking a loop in "half_factor_2"
+; VF_4-LABEL: Checking a loop in 'half_factor_2'
 ; VF_4:         Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_4-NEXT:    Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_8-LABEL: Checking a loop in "half_factor_2"
+; VF_8-LABEL: Checking a loop in 'half_factor_2'
 ; VF_8:         Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -14,22 +14,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i8_factor_2"
+; VF_2-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_2:          Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_4-LABEL:  Checking a loop in "i8_factor_2"
+; VF_4-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_8-LABEL:  Checking a loop in "i8_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16-LABEL: Checking a loop in 'i8_factor_2'
 ; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
@@ -55,22 +55,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i16_factor_2"
+; VF_2-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_2:          Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_4-LABEL:  Checking a loop in "i16_factor_2"
+; VF_4-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL:  Checking a loop in "i16_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16-LABEL: Checking a loop in 'i16_factor_2'
 ; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
@@ -96,22 +96,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i32_factor_2"
+; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_2:          Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL:  Checking a loop in "i32_factor_2"
+; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_8-LABEL:  Checking a loop in "i32_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16-LABEL: Checking a loop in 'i32_factor_2'
 ; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
@@ -137,22 +137,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i64_factor_2"
+; VF_2-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_2:          Found an estimated cost of 44 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_4-LABEL:  Checking a loop in "i64_factor_2"
+; VF_4-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_4:          Found an estimated cost of 88 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_8-LABEL:  Checking a loop in "i64_factor_2"
+; VF_8-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_8:          Found an estimated cost of 176 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16-LABEL: Checking a loop in 'i64_factor_2'
 ; VF_16:         Found an estimated cost of 352 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
@@ -178,22 +178,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "f16_factor_2"
+; VF_2-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_4-LABEL:  Checking a loop in "f16_factor_2"
+; VF_4-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_4:          Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 16 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_8-LABEL:  Checking a loop in "f16_factor_2"
+; VF_8-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_16-LABEL: Checking a loop in "f16_factor_2"
+; VF_16-LABEL: Checking a loop in 'f16_factor_2'
 ; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
@@ -219,22 +219,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "f32_factor_2"
+; VF_2-LABEL:  Checking a loop in 'f32_factor_2'
 ; VF_2:          Found an estimated cost of 10 for VF 2 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
-; VF_4-LABEL:  Checking a loop in "f32_factor_2"
+; VF_4-LABEL:  Checking a loop in 'f32_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
-; VF_8-LABEL:  Checking a loop in "f32_factor_2"
+; VF_8-LABEL:  Checking a loop in 'f32_factor_2'
 ; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
-; VF_16-LABEL: Checking a loop in "f32_factor_2"
+; VF_16-LABEL: Checking a loop in 'f32_factor_2'
 ; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
@@ -260,22 +260,22 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "f64_factor_2"
+; VF_2-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load double, double* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load double, double* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 8 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
-; VF_4-LABEL:  Checking a loop in "f64_factor_2"
+; VF_4-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp2 = load double, double* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load double, double* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 16 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
-; VF_8-LABEL:  Checking a loop in "f64_factor_2"
+; VF_8-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp2 = load double, double* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load double, double* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 32 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
-; VF_16-LABEL: Checking a loop in "f64_factor_2"
+; VF_16-LABEL: Checking a loop in 'f64_factor_2'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp2 = load double, double* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, double* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
@@ -305,28 +305,28 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i8_factor_3"
+; VF_2-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_2:          Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_4-LABEL:  Checking a loop in "i8_factor_3"
+; VF_4-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_8-LABEL:  Checking a loop in "i8_factor_3"
+; VF_8-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_8:          Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_16-LABEL: Checking a loop in "i8_factor_3"
+; VF_16-LABEL: Checking a loop in 'i8_factor_3'
 ; VF_16:         Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
@@ -357,28 +357,28 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i16_factor_3"
+; VF_2-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_2:          Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_4-LABEL:  Checking a loop in "i16_factor_3"
+; VF_4-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_8-LABEL:  Checking a loop in "i16_factor_3"
+; VF_8-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_8:          Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_16-LABEL: Checking a loop in "i16_factor_3"
+; VF_16-LABEL: Checking a loop in 'i16_factor_3'
 ; VF_16:         Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
@@ -409,28 +409,28 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i32_factor_3"
+; VF_2-LABEL:  Checking a loop in 'i32_factor_3'
 ; VF_2:          Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i32, i32* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, i32* %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_4-LABEL:  Checking a loop in "i32_factor_3"
+; VF_4-LABEL:  Checking a loop in 'i32_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, i32* %tmp2, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_8-LABEL:  Checking a loop in "i32_factor_3"
+; VF_8-LABEL:  Checking a loop in 'i32_factor_3'
 ; VF_8:          Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_16-LABEL: Checking a loop in "i32_factor_3"
+; VF_16-LABEL: Checking a loop in 'i32_factor_3'
 ; VF_16:         Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp2, align 4
@@ -461,28 +461,28 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i64_factor_3"
+; VF_2-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_2:          Found an estimated cost of 66 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store i64 0, i64* %tmp2, align 8
-; VF_4-LABEL:  Checking a loop in "i64_factor_3"
+; VF_4-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_4:          Found an estimated cost of 132 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 36 for VF 4 For instruction: store i64 0, i64* %tmp2, align 8
-; VF_8-LABEL:  Checking a loop in "i64_factor_3"
+; VF_8-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_8:          Found an estimated cost of 264 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 72 for VF 8 For instruction: store i64 0, i64* %tmp2, align 8
-; VF_16-LABEL: Checking a loop in "i64_factor_3"
+; VF_16-LABEL: Checking a loop in 'i64_factor_3'
 ; VF_16:         Found an estimated cost of 528 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
@@ -513,28 +513,28 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "f16_factor_3"
+; VF_2-LABEL:  Checking a loop in 'f16_factor_3'
 ; VF_2:          Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load half, half* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load half, half* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, half* %tmp2, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store half 0xH0000, half* %tmp2, align 2
-; VF_4-LABEL:  Checking a loop in "f16_factor_3"
+; VF_4-LABEL:  Checking a loop in 'f16_factor_3'
 ; VF_4:          Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, half* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, half* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, half* %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
-; VF_8-LABEL:  Checking a loop in "f16_factor_3"
+; VF_8-LABEL:  Checking a loop in 'f16_factor_3'
 ; VF_8:          Found an estimated cost of 56 for VF 8 For instruction: %tmp3 = load half, half* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load half, half* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
-; VF_16-LABEL: Checking a loop in "f16_factor_3"
+; VF_16-LABEL: Checking a loop in 'f16_factor_3'
 ; VF_16:         Found an estimated cost of 112 for VF 16 For instruction: %tmp3 = load half, half* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load half, half* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp2, align 2
@@ -565,28 +565,28 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "f32_factor_3"
+; VF_2-LABEL:  Checking a loop in 'f32_factor_3'
 ; VF_2:          Found an estimated cost of 16 for VF 2 For instruction: %tmp3 = load float, float* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load float, float* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, float* %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_4-LABEL:  Checking a loop in "f32_factor_3"
+; VF_4-LABEL:  Checking a loop in 'f32_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load float, float* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load float, float* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp2, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_8-LABEL:  Checking a loop in "f32_factor_3"
+; VF_8-LABEL:  Checking a loop in 'f32_factor_3'
 ; VF_8:          Found an estimated cost of 64 for VF 8 For instruction: %tmp3 = load float, float* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load float, float* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_16-LABEL: Checking a loop in "f32_factor_3"
+; VF_16-LABEL: Checking a loop in 'f32_factor_3'
 ; VF_16:         Found an estimated cost of 128 for VF 16 For instruction: %tmp3 = load float, float* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load float, float* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp2, align 4
@@ -617,28 +617,28 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "f64_factor_3"
+; VF_2-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_2:          Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load double, double* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load double, double* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, double* %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store double 0.000000e+00, double* %tmp2, align 8
-; VF_4-LABEL:  Checking a loop in "f64_factor_3"
+; VF_4-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_4:          Found an estimated cost of 36 for VF 4 For instruction: %tmp3 = load double, double* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load double, double* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, double* %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store double 0.000000e+00, double* %tmp2, align 8
-; VF_8-LABEL:  Checking a loop in "f64_factor_3"
+; VF_8-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_8:          Found an estimated cost of 72 for VF 8 For instruction: %tmp3 = load double, double* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load double, double* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, double* %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store double 0.000000e+00, double* %tmp2, align 8
-; VF_16-LABEL: Checking a loop in "f64_factor_3"
+; VF_16-LABEL: Checking a loop in 'f64_factor_3'
 ; VF_16:         Found an estimated cost of 144 for VF 16 For instruction: %tmp3 = load double, double* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load double, double* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, double* %tmp2, align 8
@@ -672,7 +672,7 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i8_factor_4"
+; VF_2-LABEL:  Checking a loop in 'i8_factor_4'
 ; VF_2:          Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
@@ -681,7 +681,7 @@
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store i8 0, i8* %tmp3, align 1
-; VF_4-LABEL: Checking a loop in "i8_factor_4"
+; VF_4-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_4:         Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
@@ -690,7 +690,7 @@
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_4-NEXT:    Found an estimated cost of 32 for VF 4 For instruction: store i8 0, i8* %tmp3, align 1
-; VF_8-LABEL:  Checking a loop in "i8_factor_4"
+; VF_8-LABEL:  Checking a loop in 'i8_factor_4'
 ; VF_8:          Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
@@ -699,7 +699,7 @@
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store i8 0, i8* %tmp3, align 1
-; VF_16-LABEL: Checking a loop in "i8_factor_4"
+; VF_16-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_16:         Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
@@ -735,7 +735,7 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i16_factor_4"
+; VF_2-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_2:          Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
@@ -744,7 +744,7 @@
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store i16 0, i16* %tmp3, align 2
-; VF_4-LABEL:  Checking a loop in "i16_factor_4"
+; VF_4-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_4:          Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
@@ -753,7 +753,7 @@
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store i16 0, i16* %tmp3, align 2
-; VF_8-LABEL:  Checking a loop in "i16_factor_4"
+; VF_8-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_8:          Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
@@ -762,7 +762,7 @@
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2
-; VF_16-LABEL: Checking a loop in "i16_factor_4"
+; VF_16-LABEL: Checking a loop in 'i16_factor_4'
 ; VF_16:         Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
@@ -798,7 +798,7 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i32_factor_4"
+; VF_2-LABEL:  Checking a loop in 'i32_factor_4'
 ; VF_2:          Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
@@ -807,7 +807,7 @@
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store i32 0, i32* %tmp3, align 4
-; VF_4-LABEL:  Checking a loop in "i32_factor_4"
+; VF_4-LABEL:  Checking a loop in 'i32_factor_4'
 ; VF_4:          Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
@@ -816,7 +816,7 @@
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4
-; VF_8-LABEL:  Checking a loop in "i32_factor_4"
+; VF_8-LABEL:  Checking a loop in 'i32_factor_4'
 ; VF_8:          Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
@@ -825,7 +825,7 @@
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4
-; VF_16-LABEL: Checking a loop in "i32_factor_4"
+; VF_16-LABEL: Checking a loop in 'i32_factor_4'
 ; VF_16:         Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
@@ -861,7 +861,7 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "i64_factor_4"
+; VF_2-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_2:          Found an estimated cost of 88 for VF 2 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
@@ -870,7 +870,7 @@
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i64 0, i64* %tmp3, align 8
-; VF_4-LABEL:  Checking a loop in "i64_factor_4"
+; VF_4-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_4:          Found an estimated cost of 176 for VF 4 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
@@ -879,7 +879,7 @@
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 48 for VF 4 For instruction: store i64 0, i64* %tmp3, align 8
-; VF_8-LABEL:  Checking a loop in "i64_factor_4"
+; VF_8-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_8:          Found an estimated cost of 352 for VF 8 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
@@ -888,7 +888,7 @@
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 96 for VF 8 For instruction: store i64 0, i64* %tmp3, align 8
-; VF_16-LABEL: Checking a loop in "i64_factor_4"
+; VF_16-LABEL: Checking a loop in 'i64_factor_4'
 ; VF_16:         Found an estimated cost of 704 for VF 16 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
@@ -924,7 +924,7 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "f16_factor_4"
+; VF_2-LABEL:  Checking a loop in 'f16_factor_4'
 ; VF_2:          Found an estimated cost of 18 for VF 2 For instruction: %tmp4 = load half, half* %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, half* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load half, half* %tmp2, align 2
@@ -933,7 +933,7 @@
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp2, align 2
 ; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store half 0xH0000, half* %tmp3, align 2
-; VF_4-LABEL:  Checking a loop in "f16_factor_4"
+; VF_4-LABEL:  Checking a loop in 'f16_factor_4'
 ; VF_4:          Found an estimated cost of 36 for VF 4 For instruction: %tmp4 = load half, half* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, half* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load half, half* %tmp2, align 2
@@ -942,7 +942,7 @@
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2
-; VF_8-LABEL:  Checking a loop in "f16_factor_4"
+; VF_8-LABEL:  Checking a loop in 'f16_factor_4'
 ; VF_8:          Found an estimated cost of 72 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, half* %tmp2, align 2
@@ -951,7 +951,7 @@
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2
-; VF_16-LABEL: Checking a loop in "f16_factor_4"
+; VF_16-LABEL: Checking a loop in 'f16_factor_4'
 ; VF_16:         Found an estimated cost of 144 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, half* %tmp2, align 2
@@ -987,7 +987,7 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "f32_factor_4"
+; VF_2-LABEL:  Checking a loop in 'f32_factor_4'
 ; VF_2:          Found an estimated cost of 20 for VF 2 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load float, float* %tmp2, align 4
@@ -996,7 +996,7 @@
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4
-; VF_4-LABEL:  Checking a loop in "f32_factor_4"
+; VF_4-LABEL:  Checking a loop in 'f32_factor_4'
 ; VF_4:          Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, float* %tmp2, align 4
@@ -1005,7 +1005,7 @@
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 ; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4
-; VF_8-LABEL:  Checking a loop in "f32_factor_4"
+; VF_8-LABEL:  Checking a loop in 'f32_factor_4'
 ; VF_8:          Found an estimated cost of 80 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, float* %tmp2, align 4
@@ -1014,7 +1014,7 @@
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4
-; VF_16-LABEL: Checking a loop in "f32_factor_4"
+; VF_16-LABEL: Checking a loop in 'f32_factor_4'
 ; VF_16:         Found an estimated cost of 160 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, float* %tmp2, align 4
@@ -1050,7 +1050,7 @@
 entry:
   br label %for.body
 
-; VF_2-LABEL:  Checking a loop in "f64_factor_4"
+; VF_2-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_2:          Found an estimated cost of 24 for VF 2 For instruction: %tmp4 = load double, double* %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, double* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load double, double* %tmp2, align 8
@@ -1059,7 +1059,7 @@
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store double 0.000000e+00, double* %tmp3, align 8
-; VF_4-LABEL:  Checking a loop in "f64_factor_4"
+; VF_4-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_4:          Found an estimated cost of 48 for VF 4 For instruction: %tmp4 = load double, double* %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, double* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load double, double* %tmp2, align 8
@@ -1068,7 +1068,7 @@
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 32 for VF 4 For instruction: store double 0.000000e+00, double* %tmp3, align 8
-; VF_8-LABEL:  Checking a loop in "f64_factor_4"
+; VF_8-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_8:          Found an estimated cost of 96 for VF 8 For instruction: %tmp4 = load double, double* %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, double* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load double, double* %tmp2, align 8
@@ -1077,7 +1077,7 @@
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 64 for VF 8 For instruction: store double 0.000000e+00, double* %tmp3, align 8
-; VF_16-LABEL: Checking a loop in "f64_factor_4"
+; VF_16-LABEL: Checking a loop in 'f64_factor_4'
 ; VF_16:         Found an estimated cost of 192 for VF 16 For instruction: %tmp4 = load double, double* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, double* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load double, double* %tmp2, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
@@ -10,7 +10,7 @@
 ; registers. Each of the 4 vector values must then be constructed from the
 ; two vector registers using one vperm each, which gives a cost of 2 + 4 = 6.
 ;
-; CHECK: LV: Checking a loop in "fun0"
+; CHECK: LV: Checking a loop in 'fun0'
 ; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction:   %ld0 = load i16
 ; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld1 = load i16
 ; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld2 = load i16
@@ -48,7 +48,7 @@
 ; 3 vector registers, and then constructing the vector value with two vperms,
 ; which gives a cost of 5.
 ;
-; CHECK: LV: Checking a loop in "fun1"
+; CHECK: LV: Checking a loop in 'fun1'
 ; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction:   %ld0 = load i8
 define void @fun1(i8 *%ptr, i8 *%dst) {
 entry:
@@ -74,7 +74,7 @@
 ; 32. At VF=2, this means loading 2 vector registers, and using 4 vperms to
 ; produce the vector values, which gives a cost of 6.
 ;
-; CHECK: LV: Checking a loop in "fun2"
+; CHECK: LV: Checking a loop in 'fun2'
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %ld0 = load i8
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
@@ -114,7 +114,7 @@
 ; as in fun2, except the stride makes the second iterations values overlap a
 ; vector register boundary.
 ;
-; CHECK: LV: Checking a loop in "fun3"
+; CHECK: LV: Checking a loop in 'fun3'
 ; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction:   %ld0 = load i8
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -12,7 +12,7 @@
 
 ; Function Attrs: nounwind readonly uwtable
 define i32 @vect() {
-; CHECK: LV: Checking a loop in "vect"
+; CHECK: LV: Checking a loop in 'vect'
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
@@ -21,7 +21,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK: LV: Checking a loop in "test_g"
+; CHECK: LV: Checking a loop in 'test_g'
 ; CHECK: LV(REG): Found max usage: 2 item
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
@@ -63,7 +63,7 @@
   ret i32 %r.0.lcssa, !dbg !38
 }
 
-; CHECK: LV: Checking a loop in "test"
+; CHECK: LV: Checking a loop in 'test'
 ; CHECK: LV(REG): Found max usage: 2 item
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
@@ -1,10 +1,10 @@
 ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
-; CHECK: "foo"
+; CHECK: 'foo'
 ; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %shift = ashr i32 %val, %k
 define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 {
-entry:  
+entry:
   br label %body
 
 body:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -9,7 +9,7 @@
 @b = common global [2048 x i32] zeroinitializer, align 16
 @c = common global [2048 x i32] zeroinitializer, align 16
 
-; CHECK: Checking a loop in "scalarselect"
+; CHECK: Checking a loop in 'scalarselect'
 define void @scalarselect(i1 %cond) {
   br label %1
 
@@ -36,7 +36,7 @@
   ret void
 }
 
-; CHECK: Checking a loop in "vectorselect"
+; CHECK: Checking a loop in 'vectorselect'
 define void @vectorselect(i1 %cond) {
   br label %1
 
diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
--- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
@@ -35,7 +35,7 @@
 }
 
 ; Check for crash exposed by D76992.
-; CHECK-LABEL: "test"
+; CHECK-LABEL: 'test'
 ; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
@@ -30,11 +30,11 @@
 ;}
 
 
-; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided1' 
 ; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
 
-; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_MASKED: LV: Checking a loop in 'masked_strided1' 
 ; STRIDED_MASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 %{{.*}}, i8* %{{.*}}, align 1
 ; STRIDED_MASKED-NEXT: LV: Inserted:  store i8  %{{.*}}, i8* %{{.*}}, align 1
@@ -63,13 +63,13 @@
 ; }
 ;}
 
-; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided2' 
 ; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
 ; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
 ; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
 
-; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_MASKED: LV: Checking a loop in 'masked_strided2' 
 ; STRIDED_MASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
@@ -97,11 +97,11 @@
 ;}
 
 
-; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided3' 
 ; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
 
-; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_MASKED: LV: Checking a loop in 'masked_strided3' 
 ; STRIDED_MASKED: LV: Analyzing interleaved accesses...
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/loop-legality-checks.ll b/llvm/test/Transforms/LoopVectorize/loop-legality-checks.ll
--- a/llvm/test/Transforms/LoopVectorize/loop-legality-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/loop-legality-checks.ll
@@ -2,7 +2,7 @@
 ; REQUIRES: asserts
 
 ; Make sure LV legal bails out when there is a non-int, non-ptr phi
-; CHECK-LABEL: "invalid_phi_types"
+; CHECK-LABEL: 'invalid_phi_types'
 ; CHECK: LV: Not vectorizing: Found a non-int non-pointer PHI.
 define i32 @invalid_phi_types() {
 entry:
@@ -21,7 +21,7 @@
 
 ; D40973
 ; Make sure LV legal bails out when the loop doesn't have a legal pre-header.
-; CHECK-LABEL: "inc"
+; CHECK-LABEL: 'inc'
 ; CHECK: LV: Not vectorizing: Loop doesn't have a legal pre-header.
 define void @inc(i32 %n, i8* %P) {
   %1 = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/LoopVectorize/nounroll.ll b/llvm/test/Transforms/LoopVectorize/nounroll.ll
--- a/llvm/test/Transforms/LoopVectorize/nounroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/nounroll.ll
@@ -3,7 +3,7 @@
 
 target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512"
 
-; CHECK: LV: Checking a loop in "f1"
+; CHECK: LV: Checking a loop in 'f1'
 ; CHECK: LV: Loop hints: force=? width=0 interleave=1
 define dso_local void @f1(i32 signext %n, i32* %A) {
 entry:
@@ -30,7 +30,7 @@
   ret void
 }
 
-; CHECK: LV: Checking a loop in "f2"
+; CHECK: LV: Checking a loop in 'f2'
 ; CHECK: LV: Loop hints: force=? width=0 interleave=4
 define dso_local void @f2(i32 signext %n, i32* %A) {
 entry:
@@ -57,7 +57,7 @@
   ret void
 }
 
-; CHECK: LV: Checking a loop in "f3"
+; CHECK: LV: Checking a loop in 'f3'
 ; CHECK: LV: Loop hints: force=? width=0 interleave=1
 define dso_local void @f3(i32 signext %n, i32* %A) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
 
 ; Currently we cannot handle live-out variables that are recurrences.
-; CHECK: LV: Checking a loop in "f2"
+; CHECK: LV: Checking a loop in 'f2'
 ; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate.
 
 define signext i32 @f2(i8* noalias %A, i32 signext %n) {
@@ -38,7 +38,7 @@
 }
 
-; Currently we cannot handle widended/truncated inductions.
+; Currently we cannot handle widened/truncated inductions.
-; CHECK: LV: Checking a loop in "f3"
+; CHECK: LV: Checking a loop in 'f3'
 ; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate.
 
 define void @f3(i8* noalias %A, i32 signext %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
 
 ; Currently we cannot handle scalable vectorization factors.
-; CHECK: LV: Checking a loop in "f1"
+; CHECK: LV: Checking a loop in 'f1'
 ; CHECK: LEV: Epilogue vectorization factor is forced.
 ; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1
 
diff --git a/llvm/test/Transforms/LoopVectorize/pr39099.ll b/llvm/test/Transforms/LoopVectorize/pr39099.ll
--- a/llvm/test/Transforms/LoopVectorize/pr39099.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr39099.ll
@@ -6,7 +6,7 @@
 ; Ensure that we don't create interleave groups for predicated
 ; strided accesses. 
 
-; CHECK: LV: Checking a loop in "masked_strided"
+; CHECK: LV: Checking a loop in 'masked_strided'
 ; CHECK: LV: Analyzing interleaved accesses...
 ; CHECK-NOT: LV: Creating an interleave group
 
diff --git a/llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll b/llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
--- a/llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
+++ b/llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -O2 -force-vector-interleave=2 -force-vector-width=4 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
-; CHECK: LV: Checking a loop in "foo"
+; CHECK: LV: Checking a loop in 'foo'
 ; CHECK: LV: Loop hints: force=enabled
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -7,7 +7,7 @@
 ; Tests for printing VPlans.
 
 define void @print_call_and_memory(i64 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
-; CHECK-LABEL: Checking a loop in "print_call_and_memory"
+; CHECK-LABEL: Checking a loop in 'print_call_and_memory'
 ; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -47,7 +47,7 @@
 }
 
 define void @print_widen_gep_and_select(i64 %n, float* noalias %y, float* noalias %x, float* %z) nounwind uwtable {
-; CHECK-LABEL: Checking a loop in "print_widen_gep_and_select"
+; CHECK-LABEL: Checking a loop in 'print_widen_gep_and_select'
 ; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -91,7 +91,7 @@
 }
 
 define float @print_reduction(i64 %n, float* noalias %y) {
-; CHECK-LABEL: Checking a loop in "print_reduction"
+; CHECK-LABEL: Checking a loop in 'print_reduction'
 ; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -128,7 +128,7 @@
 }
 
 define void @print_replicate_predicated_phi(i64 %n, i64* %x) {
-; CHECK-LABEL: Checking a loop in "print_replicate_predicated_phi"
+; CHECK-LABEL: Checking a loop in 'print_replicate_predicated_phi'
 ; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -201,7 +201,7 @@
 @CD = common global [1024 x i32] zeroinitializer, align 4
 
 define void @print_interleave_groups(i32 %C, i32 %D) {
-; CHECK-LABEL: Checking a loop in "print_interleave_groups"
+; CHECK-LABEL: Checking a loop in 'print_interleave_groups'
 ; CHECK:       VPlan 'Initial VPlan for VF={4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -261,7 +261,7 @@
 }
 
 define float @print_fmuladd_strict(float* %a, float* %b, i64 %n) {
-; CHECK-LABEL: Checking a loop in "print_fmuladd_strict"
+; CHECK-LABEL: Checking a loop in 'print_fmuladd_strict'
 ; CHECK:      VPlan 'Initial VPlan for VF={4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -301,7 +301,7 @@
 }
 
 define void @debug_loc_vpinstruction(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {
-; CHECK-LABEL: Checking a loop in "debug_loc_vpinstruction"
+; CHECK-LABEL: Checking a loop in 'debug_loc_vpinstruction'
 ; CHECK:    VPlan 'Initial VPlan for VF={4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -9,7 +9,7 @@
 @c = common global [2048 x i32] zeroinitializer, align 16
 
 
-; CHECK-LABEL: LV: Checking a loop in "sink1"
+; CHECK-LABEL: LV: Checking a loop in 'sink1'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -75,7 +75,7 @@
   ret void
 }
 
-; CHECK-LABEL: LV: Checking a loop in "sink2"
+; CHECK-LABEL: LV: Checking a loop in 'sink2'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -154,7 +154,7 @@
   ret void
 }
 
-; CHECK-LABEL: LV: Checking a loop in "sink3"
+; CHECK-LABEL: LV: Checking a loop in 'sink3'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -235,7 +235,7 @@
 
 ; Make sure we do not sink uniform instructions.
 define void @uniform_gep(i64 %k, i16* noalias %A, i16* noalias %B) {
-; CHECK-LABEL: LV: Checking a loop in "uniform_gep"
+; CHECK-LABEL: LV: Checking a loop in 'uniform_gep'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -325,7 +325,7 @@
 
 ; Loop with predicated load.
 define void @pred_cfg1(i32 %k, i32 %j) {
-; CHECK-LABEL: LV: Checking a loop in "pred_cfg1"
+; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -426,7 +426,7 @@
 ; Loop with predicated load and store in separate blocks, store depends on
 ; loaded value.
 define void @pred_cfg2(i32 %k, i32 %j) {
-; CHECK-LABEL: LV: Checking a loop in "pred_cfg2"
+; CHECK-LABEL: LV: Checking a loop in 'pred_cfg2'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -542,7 +542,7 @@
 ; Loop with predicated load and store in separate blocks, store does not depend
 ; on loaded value.
 define void @pred_cfg3(i32 %k, i32 %j) {
-; CHECK-LABEL: LV: Checking a loop in "pred_cfg3"
+; CHECK-LABEL: LV: Checking a loop in 'pred_cfg3'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -652,7 +652,7 @@
 }
 
 define void @merge_3_replicate_region(i32 %k, i32 %j) {
-; CHECK-LABEL: LV: Checking a loop in "merge_3_replicate_region"
+; CHECK-LABEL: LV: Checking a loop in 'merge_3_replicate_region'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -767,7 +767,7 @@
 
 
 define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) {
-; CHECK-LABEL: LV: Checking a loop in "update_2_uses_in_same_recipe_in_merged_block"
+; CHECK-LABEL: LV: Checking a loop in 'update_2_uses_in_same_recipe_in_merged_block'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -834,7 +834,7 @@
 }
 
 define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) {
-; CHECK-LABEL: LV: Checking a loop in "recipe_in_merge_candidate_used_by_first_order_recurrence"
+; CHECK-LABEL: LV: Checking a loop in 'recipe_in_merge_candidate_used_by_first_order_recurrence'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -918,7 +918,7 @@
 }
 
 define void @update_multiple_users(i16* noalias %src, i8* noalias %dst, i1 %c) {
-; CHECK-LABEL: LV: Checking a loop in "update_multiple_users"
+; CHECK-LABEL: LV: Checking a loop in 'update_multiple_users'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
@@ -989,7 +989,7 @@
 }
 
 define void @sinking_requires_duplication(float* %addr) {
-; CHECK-LABEL: LV: Checking a loop in "sinking_requires_duplication"
+; CHECK-LABEL: LV: Checking a loop in 'sinking_requires_duplication'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
--- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
@@ -974,12 +974,13 @@
 ; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
 ; AMDGPU-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
+; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; AMDGPU-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR8]]
-; AMDGPU-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; AMDGPU-NEXT:    ret void
 ;
 ;
@@ -1106,9 +1107,10 @@
 ; AMDGPU-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
 ; AMDGPU-NEXT:    call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR8]]
 ; AMDGPU-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR8]]
-; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
+; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; AMDGPU-NEXT:    call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]]
 ; AMDGPU-NEXT:    ret void
 ;
@@ -1251,12 +1253,13 @@
 ; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
+; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; AMDGPU-NEXT:    [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]]
-; AMDGPU-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; AMDGPU-NEXT:    ret void
 ;
 ;
@@ -1375,12 +1378,13 @@
 ; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
+; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; AMDGPU-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; AMDGPU-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; AMDGPU-NEXT:    ret void
 ;
 ;
@@ -1500,11 +1504,12 @@
 ; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
 ; AMDGPU-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
-; AMDGPU-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; AMDGPU-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; AMDGPU-NEXT:    ret void
 ;
 ;
@@ -1626,7 +1631,8 @@
 ; AMDGPU-NEXT:  entry:
 ; AMDGPU-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 ; AMDGPU-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-; AMDGPU-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A]], 0
+; AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; AMDGPU-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0
 ; AMDGPU-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; AMDGPU:       if.then:
 ; AMDGPU-NEXT:    br label [[RETURN:%.*]]
@@ -1949,12 +1955,13 @@
 ; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
 ; NVPTX-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
+; NVPTX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; NVPTX-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR8]]
-; NVPTX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; NVPTX-NEXT:    ret void
 ;
 ;
@@ -2080,9 +2087,10 @@
 ; NVPTX-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
 ; NVPTX-NEXT:    call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR8]]
 ; NVPTX-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR8]]
-; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
+; NVPTX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; NVPTX-NEXT:    call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]]
 ; NVPTX-NEXT:    ret void
 ;
@@ -2224,12 +2232,13 @@
 ; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
+; NVPTX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; NVPTX-NEXT:    [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]]
-; NVPTX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; NVPTX-NEXT:    ret void
 ;
 ;
@@ -2347,12 +2356,13 @@
 ; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
+; NVPTX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; NVPTX-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; NVPTX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; NVPTX-NEXT:    ret void
 ;
 ;
@@ -2471,11 +2481,12 @@
 ; NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
 ; NVPTX-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
-; NVPTX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; NVPTX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; NVPTX-NEXT:    ret void
 ;
 ;
@@ -2596,7 +2607,8 @@
 ; NVPTX-NEXT:  entry:
 ; NVPTX-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 ; NVPTX-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-; NVPTX-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A]], 0
+; NVPTX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; NVPTX-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0
 ; NVPTX-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; NVPTX:       if.then:
 ; NVPTX-NEXT:    br label [[RETURN:%.*]]
@@ -2878,12 +2890,13 @@
 ; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
 ; AMDGPU-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; AMDGPU-DISABLED-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR8]]
-; AMDGPU-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
 ; AMDGPU-DISABLED-NEXT:    ret void
 ;
 ;
@@ -2963,9 +2976,10 @@
 ; AMDGPU-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
 ; AMDGPU-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR8]]
 ; AMDGPU-DISABLED-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR8]]
-; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; AMDGPU-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]]
 ; AMDGPU-DISABLED-NEXT:    ret void
 ;
@@ -3065,12 +3079,13 @@
 ; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; AMDGPU-DISABLED-NEXT:    [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]]
-; AMDGPU-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
 ; AMDGPU-DISABLED-NEXT:    ret void
 ;
 ;
@@ -3148,12 +3163,13 @@
 ; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; AMDGPU-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; AMDGPU-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
 ; AMDGPU-DISABLED-NEXT:    ret void
 ;
 ;
@@ -3232,11 +3248,12 @@
 ; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; AMDGPU-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
 ; AMDGPU-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
-; AMDGPU-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; AMDGPU-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; AMDGPU-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; AMDGPU-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; AMDGPU-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
 ; AMDGPU-DISABLED-NEXT:    ret void
 ;
 ;
@@ -3321,7 +3338,8 @@
 ; AMDGPU-DISABLED-NEXT:  entry:
 ; AMDGPU-DISABLED-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 ; AMDGPU-DISABLED-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-; AMDGPU-DISABLED-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A]], 0
+; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; AMDGPU-DISABLED-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0
 ; AMDGPU-DISABLED-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; AMDGPU-DISABLED:       if.then:
 ; AMDGPU-DISABLED-NEXT:    br label [[RETURN:%.*]]
@@ -3573,12 +3591,13 @@
 ; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
 ; NVPTX-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; NVPTX-DISABLED-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR8]]
-; NVPTX-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
 ; NVPTX-DISABLED-NEXT:    ret void
 ;
 ;
@@ -3658,9 +3677,10 @@
 ; NVPTX-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
 ; NVPTX-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR8]]
 ; NVPTX-DISABLED-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR8]]
-; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; NVPTX-DISABLED-NEXT:    call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]]
 ; NVPTX-DISABLED-NEXT:    ret void
 ;
@@ -3760,12 +3780,13 @@
 ; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; NVPTX-DISABLED-NEXT:    [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]]
-; NVPTX-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
 ; NVPTX-DISABLED-NEXT:    ret void
 ;
 ;
@@ -3843,12 +3864,13 @@
 ; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
 ; NVPTX-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; NVPTX-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
 ; NVPTX-DISABLED-NEXT:    ret void
 ;
 ;
@@ -3927,11 +3949,12 @@
 ; NVPTX-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
 ; NVPTX-DISABLED-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
 ; NVPTX-DISABLED-NEXT:    call void @unknown_no_openmp() #[[ATTR9]]
-; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
-; NVPTX-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; NVPTX-DISABLED-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; NVPTX-DISABLED-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; NVPTX-DISABLED-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
+; NVPTX-DISABLED-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
 ; NVPTX-DISABLED-NEXT:    ret void
 ;
 ;
@@ -4016,7 +4039,8 @@
 ; NVPTX-DISABLED-NEXT:  entry:
 ; NVPTX-DISABLED-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 ; NVPTX-DISABLED-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
-; NVPTX-DISABLED-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A]], 0
+; NVPTX-DISABLED-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; NVPTX-DISABLED-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0
 ; NVPTX-DISABLED-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; NVPTX-DISABLED:       if.then:
 ; NVPTX-DISABLED-NEXT:    br label [[RETURN:%.*]]
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -84,7 +84,15 @@
   ret void
 }
 
-declare i8* @__kmpc_alloc_shared(i64)
+@offset =global i32 undef ; i32 global with an undef initializer; loaded below as the byte index into @stack
+@stack = internal addrspace(3) global [1024 x i8] undef ; 1024-byte buffer in addrspace(3) that backs the allocator definition below
+define private i8* @__kmpc_alloc_shared(i64) { ; test-local definition replacing the former bare declaration; the i64 argument is unnamed and never read
+  %bc = bitcast [1024 x i8] addrspace(3) * @stack to i8 addrspace(3) * ; view the array as a plain i8 pointer, still in addrspace(3)
+  %ac = addrspacecast i8 addrspace(3) * %bc to i8* ; cast into the default address space to match the i8* return type
+  %l = load i32, i32* @offset ; read the current value of @offset
+  %gep = getelementptr i8, i8* %ac, i32 %l ; step %l bytes into the buffer (plain GEP: no inbounds flag, nothing checks %l against 1024)
+  ret i8* %gep ; the returned pointer depends only on @offset, not on the i64 argument
+}
 
 declare void @__kmpc_free_shared(i8*, i64)
 
@@ -122,33 +130,41 @@
 ; CHECK: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global i8*
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [113 x i8] c"
 ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
+; CHECK: @[[OFFSET:[a-zA-Z0-9_$"\\.-]+]] = global i32 undef
+; CHECK: @[[STACK:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [1024 x i8] undef
 ; CHECK: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [16 x i8] undef, align 4
 ; CHECK: @[[Y_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
 ;.
 ; CHECK-LABEL: define {{[^@]+}}@foo() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; CHECK-NEXT:    [[X:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 4) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR4:[0-9]+]]
-; CHECK-NEXT:    call void @use.internalized(i8* nofree align 4 [[X]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT:    call void @__kmpc_free_shared(i8* [[X]], i64 4) #[[ATTR5]]
+; CHECK-NEXT:    [[X:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[X]] to i32*
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[X_ON_STACK]] to i8*
+; CHECK-NEXT:    call void @use.internalized(i8* nofree align 4 [[TMP0]]) #[[ATTR7:[0-9]+]]
+; CHECK-NEXT:    call void @__kmpc_free_shared(i8* [[X]], i64 4) #[[ATTR8:[0-9]+]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bar() {
 ; CHECK-NEXT:    [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR4]]
+; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR5]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C]], -1
 ; CHECK-NEXT:    br i1 [[CMP]], label [[MASTER1:%.*]], label [[EXIT:%.*]]
 ; CHECK:       master1:
-; CHECK-NEXT:    call void @use.internalized(i8* nofree align 4 addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @x_shared, i32 0, i32 0) to i8*)) #[[ATTR6]]
+; CHECK-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @x_shared, i32 0, i32 0) to i8*) to [4 x i32]*
+; CHECK-NEXT:    [[A0:%.*]] = bitcast [4 x i32]* [[X_ON_STACK]] to i8*
+; CHECK-NEXT:    call void @use.internalized(i8* nofree align 4 [[A0]]) #[[ATTR7]]
 ; CHECK-NEXT:    br label [[NEXT:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR4]]
+; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR5]]
 ; CHECK-NEXT:    br label [[MASTER2:%.*]]
 ; CHECK:       master2:
-; CHECK-NEXT:    call void @use.internalized(i8* nofree align 4 addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @y_shared, i32 0, i32 0) to i8*)) #[[ATTR6]]
+; CHECK-NEXT:    [[Y_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @y_shared, i32 0, i32 0) to i8*) to [4 x i32]*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast [4 x i32]* [[Y_ON_STACK]] to i8*
+; CHECK-NEXT:    call void @use.internalized(i8* nofree align 4 [[B1]]) #[[ATTR7]]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
@@ -157,13 +173,15 @@
 ;
 ; CHECK-LABEL: define {{[^@]+}}@baz_spmd() {
 ; CHECK-NEXT:    [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 true, i1 true)
-; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR4]]
+; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR5]]
 ; CHECK-NEXT:    [[C0:%.*]] = icmp eq i32 [[C]], -1
 ; CHECK-NEXT:    br i1 [[C0]], label [[MASTER3:%.*]], label [[EXIT:%.*]]
 ; CHECK:       master3:
-; CHECK-NEXT:    [[Z:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 24) #[[ATTR5]], !dbg [[DBG9:![0-9]+]]
-; CHECK-NEXT:    call void @use.internalized(i8* nofree writeonly align 4 [[Z]]) #[[ATTR6]]
-; CHECK-NEXT:    call void @__kmpc_free_shared(i8* [[Z]], i64 24) #[[ATTR5]]
+; CHECK-NEXT:    [[Z:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 noundef 24) #[[ATTR6]], !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT:    [[Z_ON_STACK:%.*]] = bitcast i8* [[Z]] to [6 x i32]*
+; CHECK-NEXT:    [[C1:%.*]] = bitcast [6 x i32]* [[Z_ON_STACK]] to i8*
+; CHECK-NEXT:    call void @use.internalized(i8* nofree writeonly align 4 [[C1]]) #[[ATTR7]]
+; CHECK-NEXT:    call void @__kmpc_free_shared(i8* [[Z]], i64 24) #[[ATTR8]]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
@@ -184,14 +202,23 @@
 ; CHECK-NEXT:    store i8* [[X]], i8** @S, align 8
 ; CHECK-NEXT:    ret void
 ;
+;
+; CHECK-LABEL: define {{[^@]+}}@__kmpc_alloc_shared
+; CHECK-SAME: (i64 [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[L:%.*]] = load i32, i32* @offset, align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([1024 x i8], [1024 x i8] addrspace(3)* @stack, i32 0, i32 0) to i8*), i32 [[L]]
+; CHECK-NEXT:    ret i8* [[GEP]]
+;
 ;.
 ; CHECK: attributes #[[ATTR0]] = { nofree norecurse nosync nounwind willreturn writeonly }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nosync nounwind }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind readnone speculatable }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { nofree nosync nounwind readnone speculatable willreturn }
-; CHECK: attributes #[[ATTR4]] = { "llvm.assume"="omp_no_openmp" }
-; CHECK: attributes #[[ATTR5]] = { nounwind }
-; CHECK: attributes #[[ATTR6]] = { nounwind writeonly }
+; CHECK: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind readonly willreturn allocsize(0) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nosync nounwind }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind readnone speculatable }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nofree nosync nounwind readnone speculatable willreturn }
+; CHECK: attributes #[[ATTR5]] = { "llvm.assume"="omp_no_openmp" }
+; CHECK: attributes #[[ATTR6]] = { nounwind readonly }
+; CHECK: attributes #[[ATTR7]] = { nounwind writeonly }
+; CHECK: attributes #[[ATTR8]] = { nounwind }
 ;.
 ; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
 ; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "replace_globalization.c", directory: "/tmp/replace_globalization.c")
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -2169,25 +2169,25 @@
 ; Function Attrs: alwaysinline convergent nounwind
 define internal void @.omp_outlined.(i32 %.global_tid., i32* noalias %.part_id., i8* noalias %.privates., void (i8*, ...)* noalias %.copy_fn., i8* %.task_t., %struct.anon* noalias %__context) #9 {
 ; AMDGPU-LABEL: define {{[^@]+}}@.omp_outlined.
-; AMDGPU-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR0]] {
+; AMDGPU-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone align 4294967296 [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR0]] {
 ; AMDGPU-NEXT:  entry:
 ; AMDGPU-NEXT:    call void @spmd_amenable() #[[ATTR5]]
 ; AMDGPU-NEXT:    ret void
 ;
 ; NVPTX-LABEL: define {{[^@]+}}@.omp_outlined.
-; NVPTX-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR0]] {
+; NVPTX-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone align 4294967296 [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR0]] {
 ; NVPTX-NEXT:  entry:
 ; NVPTX-NEXT:    call void @spmd_amenable() #[[ATTR5]]
 ; NVPTX-NEXT:    ret void
 ;
 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined.
-; AMDGPU-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR0]] {
+; AMDGPU-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone align 4294967296 [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR0]] {
 ; AMDGPU-DISABLED-NEXT:  entry:
 ; AMDGPU-DISABLED-NEXT:    call void @spmd_amenable() #[[ATTR5]]
 ; AMDGPU-DISABLED-NEXT:    ret void
 ;
 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined.
-; NVPTX-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR0]] {
+; NVPTX-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone align 4294967296 [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR0]] {
 ; NVPTX-DISABLED-NEXT:  entry:
 ; NVPTX-DISABLED-NEXT:    call void @spmd_amenable() #[[ATTR5]]
 ; NVPTX-DISABLED-NEXT:    ret void
diff --git a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-filter.ll b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-filter.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-filter.ll
@@ -0,0 +1,9 @@
+; RUN: opt < %s -passes='print<cost-model>' -mtriple=x86_64-pc-linux-gnu 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+
+define void @replication_i64_stride2() nounwind {
+  %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  ret void
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-filter.ll.filter.expected b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-filter.ll.filter.expected
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-filter.ll.filter.expected
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "(vf4|vf16)"
+; RUN: opt < %s -passes='print<cost-model>' -mtriple=x86_64-pc-linux-gnu 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+
+define void @replication_i64_stride2() nounwind {
+; SSE2-LABEL: 'replication_i64_stride2'
+; SSE2:  Cost Model: Found an estimated cost of 18 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+; SSE2:  Cost Model: Found an estimated cost of 72 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+;
+  %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  ret void
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/filter.test b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/filter.test
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/filter.test
@@ -0,0 +1,13 @@
+# REQUIRES: x86-registered-target
+
+## Check that --filter works properly.
+# RUN: cp -f %S/Inputs/x86-filter.ll %t.ll && %update_analyze_test_checks --filter="(vf4|vf16)" %t.ll
+# RUN: diff -u %t.ll %S/Inputs/x86-filter.ll.filter.expected
+
+## Check that running the script again does not change the result:
+# RUN: %update_analyze_test_checks --filter="(vf4|vf16)" %t.ll
+# RUN: diff -u %t.ll %S/Inputs/x86-filter.ll.filter.expected
+
+## Check that running the script again, without arguments, does not change the result:
+# RUN: %update_analyze_test_checks %t.ll
+# RUN: diff -u %t.ll %S/Inputs/x86-filter.ll.filter.expected
diff --git a/llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove.test b/llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove-nonempty-segment.test
copy from llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove.test
copy to llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove-nonempty-segment.test
--- a/llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove.test
+++ b/llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove-nonempty-segment.test
@@ -1,7 +1,7 @@
-## Test bitcode section removal.
+## Test bitcode segment is not removed when not empty.
 # RUN: yaml2obj %s -o %t
 # RUN: llvm-bitcode-strip -r %t -o %t2
-# RUN: llvm-readobj --sections %t2 | FileCheck --implicit-check-not=Name: %s
+# RUN: llvm-readobj --macho-segment --sections %t2 | FileCheck --implicit-check-not=Name: %s
 
 # CHECK:      Name: __text
 # CHECK-NEXT: Segment: __TEXT
@@ -9,6 +9,12 @@
 # CHECK-NEXT: Segment: __DATA
 # CHECK:      Name: __notbundle
 # CHECK-NEXT: Segment: __LLVM
+# CHECK:      Cmd: LC_SEGMENT_64
+# CHECK-NEXT: Name: __TEXT
+# CHECK:      Cmd: LC_SEGMENT_64
+# CHECK-NEXT: Name: __DATA
+# CHECK:      Cmd: LC_SEGMENT_64
+# CHECK-NEXT: Name: __LLVM
 
 --- !mach-o
 FileHeader:
@@ -16,21 +22,21 @@
   cputype:         0x01000007
   cpusubtype:      0x00000003
   filetype:        0x00000001
-  ncmds:           1
-  sizeofcmds:      392
+  ncmds:           3
+  sizeofcmds:      536
   flags:           0x00002000
   reserved:        0x00000000
 LoadCommands:
   - cmd:             LC_SEGMENT_64
-    cmdsize:         392
-    segname:         ''
+    cmdsize:         152
+    segname:         __TEXT
     vmaddr:          0
-    vmsize:          16
-    fileoff:         424
-    filesize:        16
+    vmsize:          4
+    fileoff:         568
+    filesize:        4
     maxprot:         7
     initprot:        7
-    nsects:          4
+    nsects:          1
     flags:           0
     Sections:
       - sectname:        __text
@@ -38,7 +44,7 @@
         addr:            0x0000000000000000
         content:         'AABBCCDD'
         size:            4
-        offset:          424
+        offset:          568
         align:           0
         reloff:          0x00000000
         nreloc:          0
@@ -46,12 +52,24 @@
         reserved1:       0x00000000
         reserved2:       0x00000000
         reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA
+    vmaddr:          4
+    vmsize:          4
+    fileoff:         572
+    filesize:        4
+    maxprot:         7
+    initprot:        7
+    nsects:          1
+    flags:           0
+    Sections:
       - sectname:        __bundle
         segname:         __DATA
         addr:            0x0000000000000004
         content:         'DDAADDAA'
         size:            4
-        offset:          428
+        offset:          572
         align:           0
         reloff:          0x00000000
         nreloc:          0
@@ -59,12 +77,24 @@
         reserved1:       0x00000000
         reserved2:       0x00000000
         reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __LLVM
+    vmaddr:          8
+    vmsize:          8
+    fileoff:         576
+    filesize:        8
+    maxprot:         7
+    initprot:        7
+    nsects:          2
+    flags:           0
+    Sections:
       - sectname:        __bundle
         segname:         __LLVM
         addr:            0x0000000000000008
         content:         'EEFFEEFF'
         size:            4
-        offset:          432
+        offset:          576
         align:           0
         reloff:          0x00000000
         nreloc:          0
@@ -77,7 +107,7 @@
         addr:            0x0000000000000008
         content:         'EEFFEEFF'
         size:            4
-        offset:          436
+        offset:          580
         align:           0
         reloff:          0x00000000
         nreloc:          0
diff --git a/llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove.test b/llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove.test
--- a/llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove.test
+++ b/llvm/test/tools/llvm-objcopy/MachO/bitcode-strip-remove.test
@@ -1,14 +1,16 @@
-## Test bitcode section removal.
+## Test bitcode section and segment removal.
 # RUN: yaml2obj %s -o %t
 # RUN: llvm-bitcode-strip -r %t -o %t2
-# RUN: llvm-readobj --sections %t2 | FileCheck --implicit-check-not=Name: %s
+# RUN: llvm-readobj --macho-segment --sections %t2 | FileCheck --implicit-check-not=Name: %s
 
 # CHECK:      Name: __text
 # CHECK-NEXT: Segment: __TEXT
 # CHECK:      Name: __bundle
 # CHECK-NEXT: Segment: __DATA
-# CHECK:      Name: __notbundle
-# CHECK-NEXT: Segment: __LLVM
+# CHECK:      Cmd: LC_SEGMENT_64
+# CHECK-NEXT: Name: __TEXT
+# CHECK:      Cmd: LC_SEGMENT_64
+# CHECK-NEXT: Name: __DATA
 
 --- !mach-o
 FileHeader:
@@ -16,21 +18,21 @@
   cputype:         0x01000007
   cpusubtype:      0x00000003
   filetype:        0x00000001
-  ncmds:           1
-  sizeofcmds:      392
+  ncmds:           3
+  sizeofcmds:      456
   flags:           0x00002000
   reserved:        0x00000000
 LoadCommands:
   - cmd:             LC_SEGMENT_64
-    cmdsize:         392
-    segname:         ''
+    cmdsize:         152
+    segname:         __TEXT
     vmaddr:          0
-    vmsize:          16
-    fileoff:         424
-    filesize:        16
+    vmsize:          4
+    fileoff:         488
+    filesize:        4
     maxprot:         7
     initprot:        7
-    nsects:          4
+    nsects:          1
     flags:           0
     Sections:
       - sectname:        __text
@@ -38,7 +40,7 @@
         addr:            0x0000000000000000
         content:         'AABBCCDD'
         size:            4
-        offset:          424
+        offset:          488
         align:           0
         reloff:          0x00000000
         nreloc:          0
@@ -46,12 +48,24 @@
         reserved1:       0x00000000
         reserved2:       0x00000000
         reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA
+    vmaddr:          4
+    vmsize:          4
+    fileoff:         492
+    filesize:        4
+    maxprot:         7
+    initprot:        7
+    nsects:          1
+    flags:           0
+    Sections:
       - sectname:        __bundle
         segname:         __DATA
         addr:            0x0000000000000004
         content:         'DDAADDAA'
         size:            4
-        offset:          428
+        offset:          492
         align:           0
         reloff:          0x00000000
         nreloc:          0
@@ -59,25 +73,24 @@
         reserved1:       0x00000000
         reserved2:       0x00000000
         reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __LLVM
+    vmaddr:          8
+    vmsize:          4
+    fileoff:         496
+    filesize:        4
+    maxprot:         7
+    initprot:        7
+    nsects:          1
+    flags:           0
+    Sections:
       - sectname:        __bundle
         segname:         __LLVM
         addr:            0x0000000000000008
         content:         'EEFFEEFF'
         size:            4
-        offset:          432
-        align:           0
-        reloff:          0x00000000
-        nreloc:          0
-        flags:           0x00000000
-        reserved1:       0x00000000
-        reserved2:       0x00000000
-        reserved3:       0x00000000
-      - sectname:        __notbundle
-        segname:         __LLVM
-        addr:            0x0000000000000008
-        content:         'EEFFEEFF'
-        size:            4
-        offset:          436
+        offset:          496
         align:           0
         reloff:          0x00000000
         nreloc:          0
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -14,6 +14,7 @@
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/ObjCopy/CommonConfig.h"
 #include "llvm/ObjCopy/ConfigManager.h"
+#include "llvm/ObjCopy/MachO/MachOConfig.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/CRC.h"
@@ -1189,6 +1190,7 @@
   DriverConfig DC;
   ConfigManager ConfigMgr;
   CommonConfig &Config = ConfigMgr.Common;
+  MachOConfig &MachOConfig = ConfigMgr.MachO;
   BitcodeStripOptTable T;
   unsigned MissingArgumentIndex, MissingArgumentCount;
   opt::InputArgList InputArgs =
@@ -1233,9 +1235,11 @@
   if (!InputArgs.hasArg(BITCODE_STRIP_remove))
     return createStringError(errc::invalid_argument, "no action specified");
 
-  // We only support -r for now, which removes all bitcode sections.
+  // We only support -r for now, which removes all bitcode sections and
+  // the __LLVM segment if it's now empty.
   cantFail(Config.ToRemove.addMatcher(NameOrPattern::create(
       "__LLVM,__bundle", MatchStyle::Literal, ErrorCallback)));
+  MachOConfig.EmptySegmentsToRemove.insert("__LLVM");
 
   DC.CopyConfigs.push_back(std::move(ConfigMgr));
   return std::move(DC);
diff --git a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt
--- a/llvm/unittests/IR/CMakeLists.txt
+++ b/llvm/unittests/IR/CMakeLists.txt
@@ -41,6 +41,7 @@
   ValueHandleTest.cpp
   ValueMapTest.cpp
   ValueTest.cpp
+  VectorBuilderTest.cpp
   VectorTypesTest.cpp
   VerifierTest.cpp
   VPIntrinsicTest.cpp
diff --git a/llvm/unittests/IR/VectorBuilderTest.cpp b/llvm/unittests/IR/VectorBuilderTest.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/IR/VectorBuilderTest.cpp
@@ -0,0 +1,280 @@
+//===--------- VectorBuilderTest.cpp - VectorBuilder unit tests -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/VectorBuilder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+static const unsigned VectorNumElements = 8; // Lane count of test vectors.
+
+class VectorBuilderTest : public testing::Test {
+protected:
+  LLVMContext Context;
+
+  VectorBuilderTest() : Context() {}
+
+  std::unique_ptr<Module> createBuilderModule(Function *&Func, BasicBlock *&BB,
+                                              Value *&Mask, Value *&EVL) {
+    auto Mod = std::make_unique<Module>("TestModule", Context);
+    auto *Int32Ty = Type::getInt32Ty(Context);
+    auto *Mask8Ty =
+        FixedVectorType::get(Type::getInt1Ty(Context), VectorNumElements);
+    auto *VoidFuncTy =
+        FunctionType::get(Type::getVoidTy(Context), {Mask8Ty, Int32Ty}, false);
+    Func =
+        Function::Create(VoidFuncTy, GlobalValue::ExternalLinkage, "bla", *Mod);
+    Mask = Func->getArg(0);
+    EVL = Func->getArg(1);
+    BB = BasicBlock::Create(Context, "entry", Func);
+
+    return Mod;
+  }
+};
+
+/// Check that creating binary arithmetic VP intrinsics works.
+TEST_F(VectorBuilderTest, TestCreateBinaryInstructions) {
+  Function *F;
+  BasicBlock *BB;
+  Value *Mask, *EVL;
+  auto Mod = createBuilderModule(F, BB, Mask, EVL);
+
+  IRBuilder<> Builder(BB);
+  VectorBuilder VBuild(Builder);
+  VBuild.setMask(Mask).setEVL(EVL);
+
+  auto *FloatVecTy =
+      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
+  auto *IntVecTy =
+      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
+
+#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
+  {                                                                            \
+    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
+    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
+    auto *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
+    Value *Op = UndefValue::get(ValueTy);                                      \
+    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
+                                             {Op, Op});                        \
+    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
+    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
+    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
+    ASSERT_EQ(VPIntrin->getMaskParam(), Mask);                                 \
+    ASSERT_EQ(VPIntrin->getVectorLengthParam(), EVL);                          \
+  }
+#include "llvm/IR/Instruction.def"
+}
+
+static bool isAllTrueMask(Value *Val, unsigned NumElements) {
+  auto *ConstMask = dyn_cast<Constant>(Val);
+  if (!ConstMask)
+    return false;
+
+  // Structure check.
+  if (!ConstMask->isAllOnesValue())
+    return false;
+
+  // Type check.
+  auto *MaskVecTy = cast<FixedVectorType>(ConstMask->getType());
+  if (MaskVecTy->getNumElements() != NumElements)
+    return false;
+
+  return MaskVecTy->getElementType()->isIntegerTy(1);
+}
+
+/// Check that binary VP intrinsics get an all-true mask if no mask is set.
+TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoMask) {
+  Function *F;
+  BasicBlock *BB;
+  Value *Mask, *EVL;
+  auto Mod = createBuilderModule(F, BB, Mask, EVL);
+
+  IRBuilder<> Builder(BB);
+  VectorBuilder VBuild(Builder);
+  VBuild.setEVL(EVL).setStaticVL(VectorNumElements);
+
+  auto *FloatVecTy =
+      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
+  auto *IntVecTy =
+      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
+
+#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
+  {                                                                            \
+    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
+    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
+    Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
+    Value *Op = UndefValue::get(ValueTy);                                      \
+    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
+                                             {Op, Op});                        \
+    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
+    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
+    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
+    ASSERT_TRUE(isAllTrueMask(VPIntrin->getMaskParam(), VectorNumElements));   \
+    ASSERT_EQ(VPIntrin->getVectorLengthParam(), EVL);                          \
+  }
+#include "llvm/IR/Instruction.def"
+}
+
+static bool isLegalConstEVL(Value *Val, unsigned ExpectedEVL) {
+  auto *ConstEVL = dyn_cast<ConstantInt>(Val);
+  if (!ConstEVL)
+    return false;
+
+  // Value check.
+  if (ConstEVL->getZExtValue() != ExpectedEVL)
+    return false;
+
+  // Type check.
+  return ConstEVL->getType()->isIntegerTy(32);
+}
+
+/// Check that binary VP intrinsics get a constant EVL if no EVL is set.
+TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoEVL) {
+  Function *F;
+  BasicBlock *BB;
+  Value *Mask, *EVL;
+  auto Mod = createBuilderModule(F, BB, Mask, EVL);
+
+  IRBuilder<> Builder(BB);
+  VectorBuilder VBuild(Builder);
+  VBuild.setMask(Mask).setStaticVL(VectorNumElements);
+
+  auto *FloatVecTy =
+      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
+  auto *IntVecTy =
+      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
+
+#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
+  {                                                                            \
+    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
+    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
+    Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
+    Value *Op = UndefValue::get(ValueTy);                                      \
+    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
+                                             {Op, Op});                        \
+    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
+    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
+    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
+    ASSERT_EQ(VPIntrin->getMaskParam(), Mask);                                 \
+    ASSERT_TRUE(                                                               \
+        isLegalConstEVL(VPIntrin->getVectorLengthParam(), VectorNumElements)); \
+  }
+#include "llvm/IR/Instruction.def"
+}
+
+/// Check that both the mask and the EVL are defaulted when neither is set.
+TEST_F(VectorBuilderTest,
+       TestCreateBinaryInstructions_FixedVector_NoMask_NoEVL) {
+  Function *F;
+  BasicBlock *BB;
+  Value *Mask, *EVL;
+  auto Mod = createBuilderModule(F, BB, Mask, EVL);
+
+  IRBuilder<> Builder(BB);
+  VectorBuilder VBuild(Builder);
+  VBuild.setStaticVL(VectorNumElements);
+
+  auto *FloatVecTy =
+      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
+  auto *IntVecTy =
+      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
+
+#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
+  {                                                                            \
+    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
+    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
+    Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
+    Value *Op = UndefValue::get(ValueTy);                                      \
+    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
+                                             {Op, Op});                        \
+    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
+    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
+    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
+    ASSERT_TRUE(isAllTrueMask(VPIntrin->getMaskParam(), VectorNumElements));   \
+    ASSERT_TRUE(                                                               \
+        isLegalConstEVL(VPIntrin->getVectorLengthParam(), VectorNumElements)); \
+  }
+#include "llvm/IR/Instruction.def"
+}
+/// Check that creating vp.load/vp.store works.
+TEST_F(VectorBuilderTest, TestCreateLoadStore) {
+  Function *F;
+  BasicBlock *BB;
+  Value *Mask, *EVL;
+  auto Mod = createBuilderModule(F, BB, Mask, EVL);
+
+  IRBuilder<> Builder(BB);
+  VectorBuilder VBuild(Builder);
+  VBuild.setMask(Mask).setEVL(EVL);
+
+  auto *FloatVecTy =
+      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
+  auto *FloatVecPtrTy = FloatVecTy->getPointerTo();
+
+  Value *FloatVecPtr = UndefValue::get(FloatVecPtrTy);
+  Value *FloatVec = UndefValue::get(FloatVecTy);
+
+  // vp.load: check the intrinsic ID and the memory pointer operand.
+  auto LoadVPID = VPIntrinsic::getForOpcode(Instruction::Load);
+  auto *LoadIntrin = VBuild.createVectorInstruction(Instruction::Load,
+                                                    FloatVecTy, {FloatVecPtr});
+  ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin));
+  auto *VPLoad = cast<VPIntrinsic>(LoadIntrin);
+  ASSERT_EQ(VPLoad->getIntrinsicID(), LoadVPID);
+  ASSERT_EQ(VPLoad->getMemoryPointerParam(), FloatVecPtr);
+
+  // vp.store: check the intrinsic ID, pointer, and stored data operand.
+  auto *VoidTy = Builder.getVoidTy();
+  auto StoreVPID = VPIntrinsic::getForOpcode(Instruction::Store);
+  auto *StoreIntrin = VBuild.createVectorInstruction(Instruction::Store, VoidTy,
+                                                     {FloatVec, FloatVecPtr});
+  ASSERT_TRUE(isa<VPIntrinsic>(StoreIntrin));
+  auto *VPStore = cast<VPIntrinsic>(StoreIntrin);
+  ASSERT_EQ(VPStore->getIntrinsicID(), StoreVPID);
+  ASSERT_EQ(VPStore->getMemoryPointerParam(), FloatVecPtr);
+  ASSERT_EQ(VPStore->getMemoryDataParam(), FloatVec);
+}
+
+/// Check that the SilentlyReturnNone error handling mode works.
+TEST_F(VectorBuilderTest, TestFail_SilentlyReturnNone) {
+  Function *F;
+  BasicBlock *BB;
+  Value *Mask, *EVL;
+  auto Mod = createBuilderModule(F, BB, Mask, EVL);
+
+  IRBuilder<> Builder(BB);
+  auto *VoidTy = Builder.getVoidTy();
+  VectorBuilder VBuild(Builder, VectorBuilder::Behavior::SilentlyReturnNone);
+  VBuild.setMask(Mask).setEVL(EVL);
+  auto *Val = VBuild.createVectorInstruction(Instruction::Br, VoidTy, {});
+  ASSERT_EQ(Val, nullptr);
+}
+
+/// Check that the ReportAndAbort error handling mode aborts as advertised.
+TEST_F(VectorBuilderTest, TestFail_ReportAndAbort) {
+  Function *F;
+  BasicBlock *BB;
+  Value *Mask, *EVL;
+  auto Mod = createBuilderModule(F, BB, Mask, EVL);
+
+  IRBuilder<> Builder(BB);
+  auto *VoidTy = Builder.getVoidTy();
+  VectorBuilder VBuild(Builder, VectorBuilder::Behavior::ReportAndAbort);
+  VBuild.setMask(Mask).setEVL(EVL);
+  ASSERT_DEATH({ VBuild.createVectorInstruction(Instruction::Br, VoidTy, {}); },
+               "No VPIntrinsic for this opcode");
+}
+
+} // end anonymous namespace
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -916,12 +916,12 @@
              check_label_format, False, preserve_names, global_vars_seen_dict,
              is_filtered)
 
-def add_analyze_checks(output_lines, comment_marker, prefix_list, func_dict, func_name):
+def add_analyze_checks(output_lines, comment_marker, prefix_list, func_dict, func_name, is_filtered):
   check_label_format = '{} %s-LABEL: \'%s%s\''.format(comment_marker)
   global_vars_seen_dict = {}
   add_checks(output_lines, comment_marker, prefix_list, func_dict, func_name,
              check_label_format, False, True, global_vars_seen_dict,
-             is_filtered = False)
+             is_filtered)
 
 def build_global_values_dictionary(glob_val_dict, raw_tool_output, prefixes):
   for nameless_value in nameless_values:
diff --git a/llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/Pseudo/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/Pseudo/BUILD.gn
--- a/llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/Pseudo/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/Pseudo/BUILD.gn
@@ -7,13 +7,13 @@
     "//llvm/lib/Support",
   ]
   sources = [
+    "DirectiveMap.cpp",
     "Grammar.cpp",
     "GrammarBNF.cpp",
     "LRGraph.cpp",
     "LRTable.cpp",
     "LRTableBuild.cpp",
     "Lex.cpp",
-    "Preprocess.cpp",
     "Token.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/Pseudo/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/Pseudo/BUILD.gn
--- a/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/Pseudo/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/Pseudo/BUILD.gn
@@ -11,9 +11,9 @@
     "//llvm/lib/Testing/Support",
   ]
   sources = [
+    "DirectiveMapTest.cpp",
     "GrammarTest.cpp",
     "LRTableTest.cpp",
-    "PreprocessTest.cpp",
     "TokenTest.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -121,6 +121,7 @@
       "__algorithm/pop_heap.h",
       "__algorithm/prev_permutation.h",
       "__algorithm/push_heap.h",
+      "__algorithm/ranges_max_element.h",
       "__algorithm/ranges_min_element.h",
       "__algorithm/ranges_swap_ranges.h",
       "__algorithm/remove.h",
@@ -596,6 +597,7 @@
       "type_traits",
       "typeindex",
       "typeinfo",
+      "uchar.h",
       "unordered_map",
       "unordered_set",
       "utility",
diff --git a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
--- a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
@@ -76,6 +76,7 @@
     "User.cpp",
     "Value.cpp",
     "ValueSymbolTable.cpp",
+    "VectorBuilder.cpp",
     "Verifier.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
--- a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
@@ -44,6 +44,7 @@
     "ValueHandleTest.cpp",
     "ValueMapTest.cpp",
     "ValueTest.cpp",
+    "VectorBuilderTest.cpp",
     "VectorTypesTest.cpp",
     "VerifierTest.cpp",
   ]
diff --git a/llvm/utils/update_analyze_test_checks.py b/llvm/utils/update_analyze_test_checks.py
--- a/llvm/utils/update_analyze_test_checks.py
+++ b/llvm/utils/update_analyze_test_checks.py
@@ -32,19 +32,12 @@
 from __future__ import print_function
 
 import argparse
-import glob
-import itertools
 import os         # Used to advertise this file's name ("autogenerated_note").
-import string
-import subprocess
 import sys
-import tempfile
 import re
 
 from UpdateTestChecks import common
 
-ADVERT = '; NOTE: Assertions have been autogenerated by '
-
 def main():
   from argparse import RawTextHelpFormatter
   parser = argparse.ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
@@ -53,34 +46,26 @@
   parser.add_argument(
       '--function', help='The function in the test file to update')
   parser.add_argument('tests', nargs='+')
-  args = common.parse_commandline_args(parser)
+  initial_args = common.parse_commandline_args(parser)
 
   script_name = os.path.basename(__file__)
-  autogenerated_note = (ADVERT + 'utils/' + script_name)
 
-  opt_basename = os.path.basename(args.opt_binary)
+  opt_basename = os.path.basename(initial_args.opt_binary)
   if (opt_basename != "opt"):
     common.error('Unexpected opt name: ' + opt_basename)
     sys.exit(1)
 
-  test_paths = [test for pattern in args.tests for test in glob.glob(pattern)]
-  for test in test_paths:
-    with open(test) as f:
-      input_lines = [l.rstrip() for l in f]
-
-    first_line = input_lines[0] if input_lines else ""
-    if 'autogenerated' in first_line and script_name not in first_line:
-      common.warn("Skipping test which wasn't autogenerated by " + script_name + ": " + test)
-      continue
-
-    if args.update_only:
-      if not first_line or 'autogenerated' not in first_line:
-        common.warn("Skipping test which isn't autogenerated: " + test)
-        continue
+  for ti in common.itertests(initial_args.tests, parser,
+                             script_name='utils/' + script_name):
+    triple_in_ir = None
+    for l in ti.input_lines:
+      m = common.TRIPLE_IR_RE.match(l)
+      if m:
+        triple_in_ir = m.groups()[0]
+        break
 
-    run_lines = common.find_run_lines(test, input_lines)
     prefix_list = []
-    for l in run_lines:
+    for l in ti.run_lines:
       if '|' not in l:
         common.warn('Skipping unparseable RUN line: ' + l)
         continue
@@ -111,19 +96,19 @@
     builder = common.FunctionTestBuilder(
       run_list = prefix_list,
       flags = type('', (object,), {
-            'verbose': args.verbose,
-            'filters': args.filters,
+            'verbose': ti.args.verbose,
+            'filters': ti.args.filters,
             'function_signature': False,
             'check_attributes': False,
             'replace_value_regex': []}),
       scrubber_args = [],
-      path=test)
+      path=ti.path)
 
     for prefixes, opt_args in prefix_list:
       common.debug('Extracted opt cmd:', opt_basename, opt_args, file=sys.stderr)
       common.debug('Extracted FileCheck prefixes:', str(prefixes), file=sys.stderr)
 
-      raw_tool_outputs = common.invoke_tool(args.opt_binary, opt_args, test)
+      raw_tool_outputs = common.invoke_tool(ti.args.opt_binary, opt_args, ti.path)
 
       # Split analysis outputs by "Printing analysis " declarations.
       for raw_tool_output in re.split(r'Printing analysis ', raw_tool_outputs):
@@ -136,9 +121,10 @@
     prefix_set = set([prefix for prefixes, _ in prefix_list for prefix in prefixes])
     common.debug('Rewriting FileCheck prefixes:', str(prefix_set), file=sys.stderr)
     output_lines = []
-    output_lines.append(autogenerated_note)
 
-    for input_line in input_lines:
+    for input_info in ti.iterlines(output_lines):
+      input_line = input_info.line
+      args = input_info.args
       if is_in_function_start:
         if input_line == '':
           continue
@@ -149,7 +135,8 @@
             continue
 
         # Print out the various check lines here.
-        common.add_analyze_checks(output_lines, ';', prefix_list, func_dict, func_name)
+        common.add_analyze_checks(output_lines, ';', prefix_list, func_dict, func_name,
+                                  is_filtered=builder.is_filtered())
         is_in_function_start = False
 
       if is_in_function:
@@ -164,10 +151,6 @@
           is_in_function = False
         continue
 
-      # Discard any previous script advertising.
-      if input_line.startswith(ADVERT):
-        continue
-
       # If it's outside a function, it just gets copied to the output.
       output_lines.append(input_line)
 
@@ -175,14 +158,14 @@
       if not m:
         continue
       func_name = m.group(1)
-      if args.function is not None and func_name != args.function:
+      if ti.args.function is not None and func_name != ti.args.function:
         # When filtering on a specific function, skip all others.
         continue
       is_in_function = is_in_function_start = True
 
-    common.debug('Writing %d lines to %s...' % (len(output_lines), test))
+    common.debug('Writing %d lines to %s...' % (len(output_lines), ti.path))
 
-    with open(test, 'wb') as f:
+    with open(ti.path, 'wb') as f:
       f.writelines(['{}\n'.format(l).encode('utf-8') for l in output_lines])
 
 
diff --git a/mlir/docs/Diagnostics.md b/mlir/docs/Diagnostics.md
--- a/mlir/docs/Diagnostics.md
+++ b/mlir/docs/Diagnostics.md
@@ -107,6 +107,18 @@
 "Compose an interesting error: @foo, i32, (0, 1, 2)"
 ```
 
+Operations attached to a diagnostic will be printed in generic form if the
+severity level is `Error`; otherwise, custom operation printers will be used.
+```c++
+// `anotherOp` will be printed in generic form,
+// e.g. %3 = "arith.addf"(%arg4, %2) : (f32, f32) -> f32
+op->emitError() << anotherOp;
+
+// `anotherOp` will be printed using the custom printer,
+// e.g. %3 = arith.addf %arg4, %2 : f32
+op->emitRemark() << anotherOp;
+```
+
 ### Attaching notes
 
 Unlike many other compiler frameworks, notes in MLIR cannot be emitted directly.
diff --git a/mlir/docs/OpDefinitions.md b/mlir/docs/OpDefinitions.md
--- a/mlir/docs/OpDefinitions.md
+++ b/mlir/docs/OpDefinitions.md
@@ -601,6 +601,15 @@
 verified. Verifiers further down the order can rely on certain invariants being
 verified by a previous verifier and do not need to re-verify them.
 
+#### Emitting diagnostics in custom verifiers
+
+Custom verifiers should avoid printing operations using custom operation
+printers, because they require the printed operation (and sometimes its parent
+operation) to be verified first. In particular, when emitting diagnostics,
+custom verifiers should use the `Error` severity level, which prints operations
+in generic form by default, and avoid using lower severity levels (`Note`,
+`Remark`, `Warning`).
+
 ### Declarative Assembly Format
 
 The custom assembly form of the operation may be specified in a declarative
diff --git a/mlir/examples/standalone/standalone-opt/standalone-opt.cpp b/mlir/examples/standalone/standalone-opt/standalone-opt.cpp
--- a/mlir/examples/standalone/standalone-opt/standalone-opt.cpp
+++ b/mlir/examples/standalone/standalone-opt/standalone-opt.cpp
@@ -14,7 +14,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/FileUtilities.h"
-#include "mlir/Support/MlirOptMain.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/SourceMgr.h"
diff --git a/mlir/examples/standalone/standalone-translate/CMakeLists.txt b/mlir/examples/standalone/standalone-translate/CMakeLists.txt
--- a/mlir/examples/standalone/standalone-translate/CMakeLists.txt
+++ b/mlir/examples/standalone/standalone-translate/CMakeLists.txt
@@ -17,7 +17,7 @@
   MLIRParser
   MLIRPass
   MLIRSPIRV
-  MLIRTranslation
+  MLIRTranslateLib
   MLIRSupport
   )
 
diff --git a/mlir/examples/standalone/standalone-translate/standalone-translate.cpp b/mlir/examples/standalone/standalone-translate/standalone-translate.cpp
--- a/mlir/examples/standalone/standalone-translate/standalone-translate.cpp
+++ b/mlir/examples/standalone/standalone-translate/standalone-translate.cpp
@@ -13,7 +13,7 @@
 
 #include "mlir/InitAllTranslations.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Translation.h"
+#include "mlir/Tools/mlir-translate/MlirTranslateMain.h"
 
 #include "Standalone/StandaloneDialect.h"
 
diff --git a/mlir/examples/toy/Ch2/toyc.cpp b/mlir/examples/toy/Ch2/toyc.cpp
--- a/mlir/examples/toy/Ch2/toyc.cpp
+++ b/mlir/examples/toy/Ch2/toyc.cpp
@@ -19,7 +19,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Verifier.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/CommandLine.h"
@@ -98,7 +98,7 @@
   llvm::SourceMgr sourceMgr;
   sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc());
   mlir::OwningOpRef<mlir::ModuleOp> module =
-      mlir::parseSourceFile(sourceMgr, &context);
+      mlir::parseSourceFile<mlir::ModuleOp>(sourceMgr, &context);
   if (!module) {
     llvm::errs() << "Error can't load file " << inputFilename << "\n";
     return 3;
diff --git a/mlir/examples/toy/Ch3/toyc.cpp b/mlir/examples/toy/Ch3/toyc.cpp
--- a/mlir/examples/toy/Ch3/toyc.cpp
+++ b/mlir/examples/toy/Ch3/toyc.cpp
@@ -18,7 +18,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Verifier.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
@@ -93,7 +93,7 @@
 
   // Parse the input mlir.
   sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc());
-  module = mlir::parseSourceFile(sourceMgr, &context);
+  module = mlir::parseSourceFile<mlir::ModuleOp>(sourceMgr, &context);
   if (!module) {
     llvm::errs() << "Error can't load file " << inputFilename << "\n";
     return 3;
diff --git a/mlir/examples/toy/Ch4/toyc.cpp b/mlir/examples/toy/Ch4/toyc.cpp
--- a/mlir/examples/toy/Ch4/toyc.cpp
+++ b/mlir/examples/toy/Ch4/toyc.cpp
@@ -19,7 +19,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Verifier.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
@@ -94,7 +94,7 @@
 
   // Parse the input mlir.
   sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc());
-  module = mlir::parseSourceFile(sourceMgr, &context);
+  module = mlir::parseSourceFile<mlir::ModuleOp>(sourceMgr, &context);
   if (!module) {
     llvm::errs() << "Error can't load file " << inputFilename << "\n";
     return 3;
diff --git a/mlir/examples/toy/Ch5/toyc.cpp b/mlir/examples/toy/Ch5/toyc.cpp
--- a/mlir/examples/toy/Ch5/toyc.cpp
+++ b/mlir/examples/toy/Ch5/toyc.cpp
@@ -21,7 +21,7 @@
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/InitAllDialects.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
@@ -98,7 +98,7 @@
 
   // Parse the input mlir.
   sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc());
-  module = mlir::parseSourceFile(sourceMgr, &context);
+  module = mlir::parseSourceFile<mlir::ModuleOp>(sourceMgr, &context);
   if (!module) {
     llvm::errs() << "Error can't load file " << inputFilename << "\n";
     return 3;
diff --git a/mlir/examples/toy/Ch6/toyc.cpp b/mlir/examples/toy/Ch6/toyc.cpp
--- a/mlir/examples/toy/Ch6/toyc.cpp
+++ b/mlir/examples/toy/Ch6/toyc.cpp
@@ -23,7 +23,7 @@
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/InitAllDialects.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
@@ -119,7 +119,7 @@
   // Parse the input mlir.
   llvm::SourceMgr sourceMgr;
   sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc());
-  module = mlir::parseSourceFile(sourceMgr, &context);
+  module = mlir::parseSourceFile<mlir::ModuleOp>(sourceMgr, &context);
   if (!module) {
     llvm::errs() << "Error can't load file " << inputFilename << "\n";
     return 3;
diff --git a/mlir/examples/toy/Ch7/toyc.cpp b/mlir/examples/toy/Ch7/toyc.cpp
--- a/mlir/examples/toy/Ch7/toyc.cpp
+++ b/mlir/examples/toy/Ch7/toyc.cpp
@@ -23,7 +23,7 @@
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/InitAllDialects.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
@@ -119,7 +119,7 @@
   // Parse the input mlir.
   llvm::SourceMgr sourceMgr;
   sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc());
-  module = mlir::parseSourceFile(sourceMgr, &context);
+  module = mlir::parseSourceFile<mlir::ModuleOp>(sourceMgr, &context);
   if (!module) {
     llvm::errs() << "Error can't load file " << inputFilename << "\n";
     return 3;
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -239,7 +239,7 @@
 
   /// Add a bufferization state initializer that initializes the specified
   /// dialect-specific bufferization state.
-  void addDialectStateInitializer(StringRef name, DialectStateInitFn fn);
+  void addDialectStateInitializer(StringRef name, const DialectStateInitFn &fn);
 
 private:
   /// Allow a dialect.
diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -397,7 +397,7 @@
 FailureOr<TileLoopNest> tileConsumerAndFuseProducers(
     OpBuilder &b, LinalgOp consumerOp, ArrayRef<int64_t> tileSizes,
     ArrayRef<int64_t> tileInterchange,
-    Optional<LinalgLoopDistributionOptions> tileDistribution);
+    const Optional<LinalgLoopDistributionOptions> &tileDistribution);
 
 //===----------------------------------------------------------------------===//
 // Generic op region utilities
diff --git a/mlir/include/mlir/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.h b/mlir/include/mlir/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.h
@@ -0,0 +1,20 @@
+//===- BufferizableOpInterfaceImpl.h - Impl. of BufferizableOpInterface ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_SHAPE_BUFFERIZABLEOPINTERFACEIMPL_H
+#define MLIR_DIALECT_SHAPE_BUFFERIZABLEOPINTERFACEIMPL_H
+
+namespace mlir {
+class DialectRegistry;
+
+namespace shape {
+void registerBufferizableOpInterfaceExternalModels(DialectRegistry &registry);
+} // namespace shape
+} // namespace mlir
+
+#endif // MLIR_DIALECT_SHAPE_BUFFERIZABLEOPINTERFACEIMPL_H
diff --git a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h
@@ -40,21 +40,6 @@
 void populateRemoveShapeConstraintsPatterns(RewritePatternSet &patterns);
 std::unique_ptr<OperationPass<FuncOp>> createRemoveShapeConstraintsPass();
 
-/// Populates patterns for shape dialect structural type conversions and sets up
-/// the provided ConversionTarget with the appropriate legality configuration
-/// for the ops to get converted properly.
-///
-/// A "structural" type conversion is one where the underlying ops are
-/// completely agnostic to the actual types involved and simply need to update
-/// their types consistently. An example of this is shape.assuming -- the
-/// shape.assuming op and the corresponding shape.assuming_yield op need to have
-/// consistent types, but the exact types don't matter. So all that we need to
-/// do for a structural type conversion is to update both of their types
-/// consistently to the new types prescribed by the TypeConverter.
-void populateShapeStructuralTypeConversionsAndLegality(
-    TypeConverter &typeConverter, RewritePatternSet &patterns,
-    ConversionTarget &target);
-
 // Bufferizes shape dialect ops.
 //
 // Note that most shape dialect ops must be converted to std before
diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -726,6 +726,9 @@
   /// Always print operations in the generic form.
   OpPrintingFlags &printGenericOpForm();
 
+  /// Do not verify the operation when using custom operation printers.
+  OpPrintingFlags &assumeVerified();
+
   /// Use local scope when printing the operation. This allows for using the
   /// printer in a more localized and thread-safe setting, but may not
   /// necessarily be identical to what the IR will look like when dumping
@@ -747,6 +750,9 @@
   /// Return if operations should be printed in the generic form.
   bool shouldPrintGenericOpForm() const;
 
+  /// Return if operation verification should be skipped.
+  bool shouldAssumeVerified() const;
+
   /// Return if the printer should use local scope when dumping the IR.
   bool shouldUseLocalScope() const;
 
@@ -762,6 +768,9 @@
   /// Print operations in the generic form.
   bool printGenericOpFormFlag : 1;
 
+  /// Skip operation verification.
+  bool assumeVerifiedFlag : 1;
+
   /// Print operations with numberings local to the current operation.
   bool printLocalScope : 1;
 };
diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h
--- a/mlir/include/mlir/IR/Value.h
+++ b/mlir/include/mlir/IR/Value.h
@@ -24,6 +24,7 @@
 class BlockArgument;
 class Operation;
 class OpOperand;
+class OpPrintingFlags;
 class OpResult;
 class Region;
 class Value;
@@ -215,6 +216,7 @@
   // Utilities
 
   void print(raw_ostream &os);
+  void print(raw_ostream &os, const OpPrintingFlags &flags);
   void print(raw_ostream &os, AsmState &state);
   void dump();
 
diff --git a/mlir/include/mlir/Parser.h b/mlir/include/mlir/Parser/Parser.h
rename from mlir/include/mlir/Parser.h
rename to mlir/include/mlir/Parser/Parser.h
--- a/mlir/include/mlir/Parser.h
+++ b/mlir/include/mlir/Parser/Parser.h
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_PARSER_H
-#define MLIR_PARSER_H
+#ifndef MLIR_PARSER_PARSER_H
+#define MLIR_PARSER_PARSER_H
 
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -206,21 +206,21 @@
 
 /// TODO: These methods are deprecated in favor of the above template versions.
 /// They should be removed when usages have been updated.
-inline OwningOpRef<ModuleOp> parseSourceFile(const llvm::SourceMgr &sourceMgr,
-                                             MLIRContext *context) {
+[[deprecated("use parseSourceFile<ModuleOp>")]] inline OwningOpRef<ModuleOp>
+parseSourceFile(const llvm::SourceMgr &sourceMgr, MLIRContext *context) {
   return parseSourceFile<ModuleOp>(sourceMgr, context);
 }
-inline OwningOpRef<ModuleOp> parseSourceFile(llvm::StringRef filename,
-                                             MLIRContext *context) {
+[[deprecated("use parseSourceFile<ModuleOp>")]] inline OwningOpRef<ModuleOp>
+parseSourceFile(llvm::StringRef filename, MLIRContext *context) {
   return parseSourceFile<ModuleOp>(filename, context);
 }
-inline OwningOpRef<ModuleOp> parseSourceFile(llvm::StringRef filename,
-                                             llvm::SourceMgr &sourceMgr,
-                                             MLIRContext *context) {
+[[deprecated("use parseSourceFile<ModuleOp>")]] inline OwningOpRef<ModuleOp>
+parseSourceFile(llvm::StringRef filename, llvm::SourceMgr &sourceMgr,
+                MLIRContext *context) {
   return parseSourceFile<ModuleOp>(filename, sourceMgr, context);
 }
-inline OwningOpRef<ModuleOp> parseSourceString(llvm::StringRef moduleStr,
-                                               MLIRContext *context) {
+[[deprecated("use parseSourceString<ModuleOp>")]] inline OwningOpRef<ModuleOp>
+parseSourceString(llvm::StringRef moduleStr, MLIRContext *context) {
   return parseSourceString<ModuleOp>(moduleStr, context);
 }
 
@@ -268,4 +268,4 @@
 
 } // namespace mlir
 
-#endif // MLIR_PARSER_H
+#endif // MLIR_PARSER_PARSER_H
diff --git a/mlir/include/mlir/Support/DebugAction.h b/mlir/include/mlir/Support/DebugAction.h
--- a/mlir/include/mlir/Support/DebugAction.h
+++ b/mlir/include/mlir/Support/DebugAction.h
@@ -194,7 +194,8 @@
   class Handler : public DebugActionManager::HandlerBase {
   public:
     Handler()
-        : HandlerBase(TypeID::get<DebugAction<ParameterTs...>::Handler>()) {}
+        : HandlerBase(
+              TypeID::get<typename DebugAction<ParameterTs...>::Handler>()) {}
 
     /// This hook allows for controlling whether an action should execute or
     /// not. `parameters` correspond to the set of values provided by the
@@ -207,7 +208,7 @@
     /// Provide classof to allow casting between handler types.
     static bool classof(const DebugActionManager::HandlerBase *handler) {
       return handler->getHandlerID() ==
-             TypeID::get<DebugAction<ParameterTs...>::Handler>();
+             TypeID::get<typename DebugAction<ParameterTs...>::Handler>();
     }
   };
 
diff --git a/mlir/include/mlir/Support/MlirOptMain.h b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
rename from mlir/include/mlir/Support/MlirOptMain.h
rename to mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
--- a/mlir/include/mlir/Support/MlirOptMain.h
+++ b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_SUPPORT_MLIROPTMAIN_H
-#define MLIR_SUPPORT_MLIROPTMAIN_H
+#ifndef MLIR_TOOLS_MLIROPT_MLIROPTMAIN_H
+#define MLIR_TOOLS_MLIROPT_MLIROPTMAIN_H
 
 #include "mlir/Support/LogicalResult.h"
 #include "llvm/ADT/StringRef.h"
@@ -95,4 +95,4 @@
 
 } // namespace mlir
 
-#endif // MLIR_SUPPORT_MLIROPTMAIN_H
+#endif // MLIR_TOOLS_MLIROPT_MLIROPTMAIN_H
diff --git a/mlir/include/mlir/Tools/mlir-translate/MlirTranslateMain.h b/mlir/include/mlir/Tools/mlir-translate/MlirTranslateMain.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Tools/mlir-translate/MlirTranslateMain.h
@@ -0,0 +1,28 @@
+//===- MlirTranslateMain.h - MLIR Translation Driver main -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Main entry function for mlir-translate for when built as standalone binary.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TOOLS_MLIRTRANSLATE_MLIRTRANSLATEMAIN_H
+#define MLIR_TOOLS_MLIRTRANSLATE_MLIRTRANSLATEMAIN_H
+
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace mlir {
+/// Translate to/from an MLIR module from/to an external representation (e.g.
+/// LLVM IR, SPIRV binary, ...). This is the entry point for the implementation
+/// of tools like `mlir-translate`. The translation to perform is parsed from
+/// the command line. The `toolName` argument is used for the header displayed
+/// by `--help`.
+LogicalResult mlirTranslateMain(int argc, char **argv, StringRef toolName);
+} // namespace mlir
+
+#endif // MLIR_TOOLS_MLIRTRANSLATE_MLIRTRANSLATEMAIN_H
diff --git a/mlir/include/mlir/Translation.h b/mlir/include/mlir/Tools/mlir-translate/Translation.h
rename from mlir/include/mlir/Translation.h
rename to mlir/include/mlir/Tools/mlir-translate/Translation.h
--- a/mlir/include/mlir/Translation.h
+++ b/mlir/include/mlir/Tools/mlir-translate/Translation.h
@@ -9,8 +9,9 @@
 // Registry for user-provided translations.
 //
 //===----------------------------------------------------------------------===//
-#ifndef MLIR_TRANSLATION_H
-#define MLIR_TRANSLATION_H
+
+#ifndef MLIR_TOOLS_MLIRTRANSLATE_TRANSLATION_H
+#define MLIR_TOOLS_MLIRTRANSLATE_TRANSLATION_H
 
 #include "llvm/Support/CommandLine.h"
 
@@ -96,14 +97,6 @@
                        size_t globalWidth) const override;
 };
 
-/// Translate to/from an MLIR module from/to an external representation (e.g.
-/// LLVM IR, SPIRV binary, ...). This is the entry point for the implementation
-/// of tools like `mlir-translate`. The translation to perform is parsed from
-/// the command line. The `toolName` argument is used for the header displayed
-/// by `--help`.
-LogicalResult mlirTranslateMain(int argc, char **argv,
-                                llvm::StringRef toolName);
-
 } // namespace mlir
 
-#endif // MLIR_TRANSLATION_H
+#endif // MLIR_TOOLS_MLIRTRANSLATE_TRANSLATION_H
diff --git a/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp b/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp
--- a/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp
+++ b/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp
@@ -188,7 +188,7 @@
     os << "Dimension: " << getNumDomainIds() << ", ";
   }
   os << "Symbols: " << getNumSymbolIds() << ", "
-     << "Locals" << getNumLocalIds() << "\n";
+     << "Locals: " << getNumLocalIds() << "\n";
 }
 
 void PresburgerLocalSpace::dump() const { print(llvm::errs()); }
diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp
--- a/mlir/lib/CAPI/IR/IR.cpp
+++ b/mlir/lib/CAPI/IR/IR.cpp
@@ -20,7 +20,7 @@
 #include "mlir/IR/Types.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 
 #include "llvm/Support/Debug.h"
 #include <cstddef>
diff --git a/mlir/lib/CMakeLists.txt b/mlir/lib/CMakeLists.txt
--- a/mlir/lib/CMakeLists.txt
+++ b/mlir/lib/CMakeLists.txt
@@ -15,7 +15,6 @@
 add_subdirectory(Target)
 add_subdirectory(Tools)
 add_subdirectory(Transforms)
-add_subdirectory(Translation)
 
 # Only enable the ExecutionEngine if the native target is configured in.
 if(TARGET ${LLVM_NATIVE_ARCH})
diff --git a/mlir/lib/Conversion/GPUToVulkan/CMakeLists.txt b/mlir/lib/Conversion/GPUToVulkan/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUToVulkan/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToVulkan/CMakeLists.txt
@@ -15,5 +15,5 @@
   MLIRSPIRVSerialization
   MLIRSupport
   MLIRTransforms
-  MLIRTranslation
+  MLIRTranslateLib
   )
diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -618,6 +618,7 @@
       AffineMap::get(origLbMap.getNumDims() + origUbMap.getNumDims(),
                      origLbMap.getNumSymbols() + origUbMap.getNumSymbols(),
                      newUbExprs, opBuilder.getContext());
+  canonicalizeMapAndOperands(&newUbMap, &ubOperands);
 
   // Normalize the loop.
   op.setUpperBound(ubOperands, newUbMap);
@@ -640,6 +641,7 @@
   AffineExpr newIVExpr = origIVExpr * origLoopStep + origLbMap.getResult(0);
   AffineMap ivMap = AffineMap::get(origLbMap.getNumDims() + 1,
                                    origLbMap.getNumSymbols(), newIVExpr);
+  canonicalizeMapAndOperands(&ivMap, &lbOperands);
   Operation *newIV = opBuilder.create<AffineApplyOp>(loc, ivMap, lbOperands);
   op.getInductionVar().replaceAllUsesExcept(newIV->getResult(0), newIV);
   return success();
diff --git a/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp b/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp
--- a/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp
+++ b/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp
@@ -1862,8 +1862,8 @@
 OpFoldResult arith::ShLIOp::fold(ArrayRef<Attribute> operands) {
   // Don't fold if shifting more than the bit width.
   bool bounded = false;
-  auto result =
-      constFoldBinaryOp<IntegerAttr>(operands, [&](APInt a, const APInt &b) {
+  auto result = constFoldBinaryOp<IntegerAttr>(
+      operands, [&](const APInt &a, const APInt &b) {
         bounded = b.ule(b.getBitWidth());
         return std::move(a).shl(b);
       });
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -64,8 +64,8 @@
   return nullptr;
 }
 
-void BufferizationOptions::addDialectStateInitializer(StringRef name,
-                                                      DialectStateInitFn fn) {
+void BufferizationOptions::addDialectStateInitializer(
+    StringRef name, const DialectStateInitFn &fn) {
   stateInitializers.push_back(
       [=](BufferizationState &state) { state.insertDialectState(name, fn()); });
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -22,7 +22,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/SymbolTable.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/RegionUtils.h"
 
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
--- a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
@@ -16,7 +16,7 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/FunctionInterfaces.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/InliningUtils.h"
 
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -22,7 +22,7 @@
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SetVector.h"
@@ -873,7 +873,7 @@
     // Get the `sourceShape` of the `sourceType`. If the operand is a result of
     // `tensor.cast` operation and source of the cast operation has a static
     // shape, then assign it to the `sourceShape`.
-    auto parentOp = src.getDefiningOp();
+    auto *parentOp = src.getDefiningOp();
     ArrayRef<int64_t> sourceShape = sourceType.getShape();
     if (parentOp) {
       if (auto castOp = dyn_cast<tensor::CastOp>(parentOp)) {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
@@ -40,7 +40,7 @@
       const LinalgComprehensiveModuleBufferize &p) = default;
 
   explicit LinalgComprehensiveModuleBufferize(
-      AnalysisBufferizationOptions options)
+      const AnalysisBufferizationOptions &options)
       : options(options) {}
 
   void runOnOperation() override;
diff --git a/mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp b/mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp
@@ -415,7 +415,7 @@
 FailureOr<TileLoopNest> mlir::linalg::tileConsumerAndFuseProducers(
     OpBuilder &b, LinalgOp consumerOp, ArrayRef<int64_t> tileSizes,
     ArrayRef<int64_t> tileInterchange,
-    Optional<LinalgLoopDistributionOptions> tileDistribution) {
+    const Optional<LinalgLoopDistributionOptions> &tileDistribution) {
   assert(tileSizes.size() == tileInterchange.size() &&
          "expect the number of tile sizes and interchange dims to match");
   assert(isPermutation(tileInterchange) &&
diff --git a/mlir/lib/Dialect/Linalg/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/Linalg/Transforms/SparseTensorRewriting.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/SparseTensorRewriting.cpp
@@ -35,8 +35,8 @@
   if (auto enc = getSparseTensorEncoding(op->get().getType())) {
     ArrayRef<SparseTensorEncodingAttr::DimLevelType> dimTypes =
         enc.getDimLevelType();
-    for (unsigned i = 0, e = dimTypes.size(); i < e; i++)
-      if (dimTypes[i] == SparseTensorEncodingAttr::DimLevelType::Compressed)
+    for (auto dimType : dimTypes)
+      if (dimType == SparseTensorEncodingAttr::DimLevelType::Compressed)
         return true; // at least one compressed
   }
   return false;
@@ -52,7 +52,7 @@
 // Helper to detect sampling operation.
 static bool isSampling(GenericOp op) {
   auto yieldOp = cast<linalg::YieldOp>(op.region().front().getTerminator());
-  if (auto def = yieldOp.getOperand(0).getDefiningOp()) {
+  if (auto *def = yieldOp.getOperand(0).getDefiningOp()) {
     if (isa<arith::MulFOp>(def) || isa<arith::MulIOp>(def)) {
       // Both scalar input arguments used exactly once.
       Value s1 = op.getBlock()->getArgument(0);
@@ -68,7 +68,7 @@
 static bool isMulChain(Value val, Value x) {
   if (auto arg = val.dyn_cast<BlockArgument>())
     return arg != x;
-  if (auto def = val.getDefiningOp()) {
+  if (auto *def = val.getDefiningOp()) {
     if (isa<arith::MulFOp>(def) || isa<arith::MulIOp>(def))
       return isMulChain(def->getOperand(0), x) &&
              isMulChain(def->getOperand(1), x);
@@ -79,7 +79,7 @@
 // Helper to detect x = x + <multiplications>.
 static bool isSumOfMul(GenericOp op) {
   auto yieldOp = cast<linalg::YieldOp>(op.region().front().getTerminator());
-  if (auto def = yieldOp.getOperand(0).getDefiningOp()) {
+  if (auto *def = yieldOp.getOperand(0).getDefiningOp()) {
     if (isa<arith::AddFOp>(def) || isa<arith::AddIOp>(def)) {
       Value x = op.getBlock()->getArguments().back();
       return (def->getOperand(0) == x && isMulChain(def->getOperand(1), x)) ||
@@ -165,8 +165,8 @@
     addArg(mapper, fusedBlock, consBlock.getArgument(1 - other));
     addArg(mapper, fusedBlock, prodBlock.getArgument(num - 1));
     // Clone bodies of the producer and consumer in new evaluation order.
-    auto acc = prodBlock.getTerminator()->getOperand(0).getDefiningOp();
-    auto sampler = consBlock.getTerminator()->getOperand(0).getDefiningOp();
+    auto *acc = prodBlock.getTerminator()->getOperand(0).getDefiningOp();
+    auto *sampler = consBlock.getTerminator()->getOperand(0).getDefiningOp();
     rewriter.setInsertionPointToStart(fusedBlock);
     Value last;
     for (auto &op : prodBlock.without_terminator())
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1355,8 +1355,8 @@
 struct Conv1DNwcGenerator : public StructuredGenerator<LinalgOp> {
   Conv1DNwcGenerator(OpBuilder &builder, LinalgOp linalgOp, int strideW,
                      int dilationW)
-      : StructuredGenerator<LinalgOp>(builder, linalgOp), valid(false),
-        strideW(strideW), dilationW(dilationW) {
+      : StructuredGenerator<LinalgOp>(builder, linalgOp), strideW(strideW),
+        dilationW(dilationW) {
     // Determine whether `linalgOp` can be generated with this generator
     if (linalgOp.getNumInputs() != 2 || linalgOp.getNumOutputs() != 1)
       return;
@@ -1665,7 +1665,7 @@
   }
 
 private:
-  bool valid;
+  bool valid = false;
   int strideW, dilationW;
   Value lhsShaped, rhsShaped, resShaped;
   ShapedType lhsShapedType, rhsShapedType, resShapedType;
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -57,7 +57,7 @@
 //   `d0 + 2 * d1 + d3` is tiled by [0, 0, 0, 2] but not by [0, 0, 2, 0]
 //
 struct TileCheck : public AffineExprVisitor<TileCheck> {
-  TileCheck(ValueRange tileSizes) : isTiled(false), tileSizes(tileSizes) {}
+  TileCheck(ValueRange tileSizes) : tileSizes(tileSizes) {}
 
   void visitDimExpr(AffineDimExpr expr) {
     isTiled |= !isZero(tileSizes[expr.getPosition()]);
@@ -69,7 +69,7 @@
       assert(expr.getRHS().cast<AffineConstantExpr>().getValue() > 0 &&
              "nonpositive multiplying coefficient");
   }
-  bool isTiled;
+  bool isTiled = false;
   ValueRange tileSizes;
 };
 
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -400,11 +400,11 @@
       return WalkResult::advance();
     });
 
-    if (!toHoist.size())
+    if (toHoist.empty())
       return failure();
     rewriter.setInsertionPoint(lastParentWithoutScope);
-    for (auto op : toHoist) {
-      auto cloned = rewriter.clone(*op);
+    for (auto *op : toHoist) {
+      auto *cloned = rewriter.clone(*op);
       rewriter.replaceOp(op, cloned->getResults());
     }
     return success();
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
@@ -19,7 +19,7 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/MLIRContext.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Sequence.h"
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -469,8 +469,8 @@
     SmallVector<Value> operands;
 
     for (Value operand : op.getInputs()) {
-      if (auto assume_all = operand.getDefiningOp<AssumingAllOp>())
-        operands.append(assume_all.operand_begin(), assume_all->operand_end());
+      if (auto assumeAll = operand.getDefiningOp<AssumingAllOp>())
+        operands.append(assumeAll.operand_begin(), assumeAll->operand_end());
       else
         operands.push_back(operand);
     }
@@ -530,8 +530,8 @@
     // Collect shapes checked by `cstr_broadcastable` operands.
     SmallVector<std::pair<CstrBroadcastableOp, DenseSet<Value>>> shapes;
     for (auto cstr : operands) {
-      DenseSet<Value> shapes_set(cstr->operand_begin(), cstr->operand_end());
-      shapes.emplace_back(cstr, std::move(shapes_set));
+      DenseSet<Value> shapesSet(cstr->operand_begin(), cstr->operand_end());
+      shapes.emplace_back(cstr, std::move(shapesSet));
     }
 
     // Sort by the number of shape operands (larger to smaller).
@@ -543,7 +543,7 @@
     // shape operands, and remove redundant `cst_broadcastable` operations. We
     // do this until we find a set of `cst_broadcastable` operations with
     // non-overlapping constraints.
-    SmallVector<CstrBroadcastableOp> marked_for_erase;
+    SmallVector<CstrBroadcastableOp> markedForErase;
 
     for (unsigned i = 0; i < shapes.size(); ++i) {
       auto isSubset = [&](auto pair) {
@@ -553,24 +553,24 @@
       // Keep redundant `cstr_broadcastable` operations to be erased.
       auto *it = std::remove_if(shapes.begin() + i + 1, shapes.end(), isSubset);
       for (auto *it0 = it; it0 < shapes.end(); ++it0)
-        marked_for_erase.push_back(it0->first);
+        markedForErase.push_back(it0->first);
       shapes.erase(it, shapes.end());
     }
 
     // We didn't find any operands that could be removed.
-    if (marked_for_erase.empty())
+    if (markedForErase.empty())
       return failure();
 
     // Collect non-overlapping `cst_broadcastable` constraints.
-    SmallVector<Value> unique_constraints;
+    SmallVector<Value> uniqueConstraints;
     for (auto &shape : shapes)
-      unique_constraints.push_back(shape.first.getResult());
+      uniqueConstraints.push_back(shape.first.getResult());
 
     // Replace with a new `assuming_all` operation ...
-    rewriter.replaceOpWithNewOp<AssumingAllOp>(op, unique_constraints);
+    rewriter.replaceOpWithNewOp<AssumingAllOp>(op, uniqueConstraints);
 
     // ... and maybe erase `cstr_broadcastable` ops without uses.
-    for (auto &op : marked_for_erase)
+    for (auto &op : markedForErase)
       if (op->use_empty())
         rewriter.eraseOp(op);
 
diff --git a/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -0,0 +1,169 @@
+//===- BufferizableOpInterfaceImpl.cpp - Impl. of BufferizableOpInterface -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.h"
+
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Shape/IR/Shape.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+using namespace mlir::bufferization;
+using namespace mlir::shape;
+
+namespace mlir {
+namespace shape {
+namespace {
+
+/// Bufferization of shape.assuming.
+struct AssumingOpInterface
+    : public BufferizableOpInterface::ExternalModel<AssumingOpInterface,
+                                                    shape::AssumingOp> {
+  SmallVector<OpOperand *>
+  getAliasingOpOperand(Operation *op, OpResult opResult,
+                       const BufferizationState &state) const {
+    // AssumingOps do not have tensor OpOperands. The yielded value can be any
+    // SSA value that is in scope. To allow for use-def chain traversal through
+    // AssumingOps in the analysis, the corresponding yield value is considered
+    // to be aliasing with the result.
+    auto assumingOp = cast<shape::AssumingOp>(op);
+    assert(opResult.getOwner() == op && "expected a result of this op");
+    // TODO: Support multiple blocks.
+    assert(assumingOp.getDoRegion().getBlocks().size() == 1 &&
+           "expected exactly 1 block");
+    // `cast` asserts that the terminator is a shape.assuming_yield. The
+    // operand at the result's index is the one reported as aliasing.
+    auto yieldOp = cast<shape::AssumingYieldOp>(
+        assumingOp.getDoRegion().front().getTerminator());
+    return {&yieldOp->getOpOperand(opResult.getResultNumber())};
+  }
+
+  // TODO: For better bufferization results, this could return `true` only if
+  // there is a memory write in the region.
+  bool isMemoryWrite(Operation *op, OpResult opResult,
+                     const BufferizationState &state) const {
+    // Similar to scf.if, results of this op are always considered memory writes
+    // in the analysis. This is a useful pattern for all ops that have tensor
+    // OpResults but no tensor OpOperands. By default, `isMemoryWrite` is
+    // implemented in terms of `bufferizesToMemoryWrite`, which does not work on
+    // ops without OpOperands.
+    return true;
+  }
+
+  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+                          const BufferizationState &state) const {
+    auto assumingOp = cast<shape::AssumingOp>(op);
+
+    // Compute new result types.
+    SmallVector<Type> newResultTypes;
+    for (Type type : assumingOp->getResultTypes()) {
+      if (auto tensorType = type.dyn_cast<TensorType>()) {
+        newResultTypes.push_back(getMemRefType(tensorType, state.getOptions()));
+      } else {
+        newResultTypes.push_back(type);
+      }
+    }
+
+    // Create new op and move over region.
+    auto newOp = rewriter.create<shape::AssumingOp>(
+        op->getLoc(), newResultTypes, assumingOp.getWitness());
+    newOp.getDoRegion().takeBody(assumingOp.getDoRegion());
+
+    // Update terminator.
+    assert(newOp.getDoRegion().getBlocks().size() == 1 &&
+           "only 1 block supported");
+    Block *newBlock = &newOp.getDoRegion().front();
+    auto yieldOp = cast<shape::AssumingYieldOp>(newBlock->getTerminator());
+    rewriter.setInsertionPoint(yieldOp);
+    SmallVector<Value> newYieldValues;
+    for (const auto &it : llvm::enumerate(yieldOp.operands())) {
+      Value val = it.value();
+      if (val.getType().isa<TensorType>()) {
+        newYieldValues.push_back(rewriter.create<bufferization::ToMemrefOp>(
+            yieldOp.getLoc(), newResultTypes[it.index()], val));
+      } else {
+        newYieldValues.push_back(val);
+      }
+    }
+    rewriter.replaceOpWithNewOp<shape::AssumingYieldOp>(yieldOp,
+                                                        newYieldValues);
+
+    // Update all uses of the old op.
+    rewriter.setInsertionPointAfter(newOp);
+    SmallVector<Value> newResults;
+    for (const auto &it : llvm::enumerate(assumingOp->getResultTypes())) {
+      if (it.value().isa<TensorType>()) {
+        newResults.push_back(rewriter.create<bufferization::ToTensorOp>(
+            assumingOp.getLoc(), newOp->getResult(it.index())));
+      } else {
+        newResults.push_back(newOp->getResult(it.index()));
+      }
+    }
+
+    // Replace old op.
+    rewriter.replaceOp(assumingOp, newResults);
+
+    return success();
+  }
+
+  BufferRelation bufferRelation(Operation *op, OpResult opResult,
+                                const BufferizationState &state) const {
+    return BufferRelation::Equivalent;
+  }
+};
+
+/// Bufferization of shape.assuming_yield. The op is rewritten as part of its
+/// enclosing shape.assuming op, so this model is used for analysis only.
+struct AssumingYieldOpInterface
+    : public BufferizableOpInterface::ExternalModel<AssumingYieldOpInterface,
+                                                    shape::AssumingYieldOp> {
+  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                              const BufferizationState &state) const {
+    return true;
+  }
+
+  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                               const BufferizationState &state) const {
+    return false;
+  }
+
+  SmallVector<OpResult>
+  getAliasingOpResult(Operation *op, OpOperand &opOperand,
+                      const BufferizationState &state) const {
+    assert(isa<shape::AssumingOp>(op->getParentOp()) &&
+           "expected that parent is an AssumingOp");
+    return {op->getParentOp()->getResult(opOperand.getOperandNumber())};
+  }
+
+  bool mustBufferizeInPlace(Operation *op, OpOperand &opOperand,
+                            const BufferizationState &state) const {
+    // Yield operands always bufferize inplace. Otherwise, an alloc + copy
+    // may be generated inside the block. We should not return/yield allocations
+    // when possible.
+    return true;
+  }
+
+  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+                          const BufferizationState &state) const {
+    // Op is bufferized as part of AssumingOp.
+    return failure();
+  }
+};
+
+} // namespace
+} // namespace shape
+} // namespace mlir
+
+void mlir::shape::registerBufferizableOpInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addOpInterface<shape::AssumingOp, AssumingOpInterface>();
+  registry.addOpInterface<shape::AssumingYieldOp, AssumingYieldOpInterface>();
+}
diff --git a/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp
--- a/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp
@@ -8,30 +8,32 @@
 
 #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "PassDetail.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Shape/IR/Shape.h"
+#include "mlir/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/Shape/Transforms/Passes.h"
 #include "mlir/Pass/Pass.h"
 
 using namespace mlir;
+using namespace bufferization;
 
 namespace {
 struct ShapeBufferizePass : public ShapeBufferizeBase<ShapeBufferizePass> {
   void runOnOperation() override {
-    MLIRContext &ctx = getContext();
+    BufferizationOptions options = getPartialBufferizationOptions();
+    options.allowDialectInFilter<shape::ShapeDialect>();
 
-    RewritePatternSet patterns(&ctx);
-    bufferization::BufferizeTypeConverter typeConverter;
-    ConversionTarget target(ctx);
-
-    bufferization::populateBufferizeMaterializationLegality(target);
-    populateShapeStructuralTypeConversionsAndLegality(typeConverter, patterns,
-                                                      target);
-
-    if (failed(applyPartialConversion(getOperation(), target,
-                                      std::move(patterns))))
+    if (failed(bufferizeOp(getOperation(), options)))
       signalPassFailure();
   }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<bufferization::BufferizationDialect, memref::MemRefDialect,
+                    shape::ShapeDialect>();
+    shape::registerBufferizableOpInterfaceExternalModels(registry);
+  }
 };
 } // namespace
 
diff --git a/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt
@@ -1,8 +1,8 @@
 add_mlir_dialect_library(MLIRShapeOpsTransforms
+  BufferizableOpInterfaceImpl.cpp
   Bufferize.cpp
   RemoveShapeConstraints.cpp
   ShapeToShapeLowering.cpp
-  StructuralTypeConversions.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/ShapeOps/Transforms
@@ -14,6 +14,7 @@
 target_link_libraries(MLIRShapeOpsTransforms
   PUBLIC
   MLIRArithmetic
+  MLIRBufferization
   MLIRBufferizationTransforms
   MLIRIR
   MLIRMemRef
diff --git a/mlir/lib/Dialect/Shape/Transforms/StructuralTypeConversions.cpp b/mlir/lib/Dialect/Shape/Transforms/StructuralTypeConversions.cpp
deleted file mode 100644
--- a/mlir/lib/Dialect/Shape/Transforms/StructuralTypeConversions.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-//===- StructuralTypeConversions.cpp - Shape structural type conversions --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "PassDetail.h"
-#include "mlir/Dialect/Shape/IR/Shape.h"
-#include "mlir/Dialect/Shape/Transforms/Passes.h"
-#include "mlir/Transforms/DialectConversion.h"
-
-using namespace mlir;
-using namespace mlir::shape;
-
-namespace {
-class ConvertAssumingOpTypes : public OpConversionPattern<AssumingOp> {
-public:
-  using OpConversionPattern::OpConversionPattern;
-
-  LogicalResult
-  matchAndRewrite(AssumingOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const final {
-    SmallVector<Type, 2> newResultTypes;
-    newResultTypes.reserve(op.getNumResults());
-    for (auto result : op.getResults()) {
-      auto originalType = result.getType();
-      Type convertedType = getTypeConverter()->convertType(originalType);
-      newResultTypes.push_back(convertedType);
-    }
-
-    auto newAssumingOp = rewriter.create<AssumingOp>(
-        op.getLoc(), newResultTypes, op.getWitness());
-    rewriter.inlineRegionBefore(op.getDoRegion(), newAssumingOp.getDoRegion(),
-                                newAssumingOp.getDoRegion().end());
-    rewriter.replaceOp(op, newAssumingOp.getResults());
-
-    return success();
-  }
-};
-} // namespace
-
-namespace {
-class ConvertAssumingYieldOpTypes
-    : public OpConversionPattern<AssumingYieldOp> {
-public:
-  using OpConversionPattern::OpConversionPattern;
-
-  LogicalResult
-  matchAndRewrite(AssumingYieldOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const final {
-    rewriter.replaceOpWithNewOp<AssumingYieldOp>(op, adaptor.getOperands());
-    return success();
-  }
-};
-} // namespace
-
-void mlir::populateShapeStructuralTypeConversionsAndLegality(
-    TypeConverter &typeConverter, RewritePatternSet &patterns,
-    ConversionTarget &target) {
-  patterns.add<ConvertAssumingOpTypes, ConvertAssumingYieldOpTypes>(
-      typeConverter, patterns.getContext());
-  target.addDynamicallyLegalOp<AssumingOp>([&](AssumingOp op) {
-    return typeConverter.isLegal(op.getResultTypes());
-  });
-  target.addDynamicallyLegalOp<AssumingYieldOp>([&](AssumingYieldOp op) {
-    return typeConverter.isLegal(op.getOperandTypes());
-  });
-}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -52,10 +52,9 @@
         indices(numTensors, std::vector<Value>(numLoops)),
         highs(numTensors, std::vector<Value>(numLoops)),
         pidxs(numTensors, std::vector<Value>(numLoops)),
-        idxs(numTensors, std::vector<Value>(numLoops)), redExp(-1u), redVal(),
-        redKind(kNoReduc), sparseOut(op), outerParNest(nest), lexIdx(),
-        expValues(), expFilled(), expAdded(), expCount(), curVecLength(1),
-        curVecMask() {}
+        idxs(numTensors, std::vector<Value>(numLoops)), redVal(), sparseOut(op),
+        outerParNest(nest), lexIdx(), expValues(), expFilled(), expAdded(),
+        expCount(), curVecMask() {}
   /// Sparsification options.
   SparsificationOptions options;
   /// Universal dense indices and upper bounds (by index). The loops array
@@ -77,9 +76,9 @@
   std::vector<std::vector<Value>> idxs;
   /// Current reduction, updated during code generation. When indices of a
   /// reduction are exhausted, all inner loops can use a scalarized reduction.
-  unsigned redExp;
+  unsigned redExp = -1u;
   Value redVal;
-  Reduction redKind;
+  Reduction redKind = kNoReduc;
   // Sparse tensor as output. Implemented either through direct injective
   // insertion in lexicographic index order (where indices are updated
   // in the temporary array `lexIdx`) or through access pattern expansion
@@ -92,7 +91,7 @@
   Value expAdded;
   Value expCount;
   // Current vector length and mask.
-  unsigned curVecLength;
+  unsigned curVecLength = 1;
   Value curVecMask;
 };
 
diff --git a/mlir/lib/ExecutionEngine/JitRunner.cpp b/mlir/lib/ExecutionEngine/JitRunner.cpp
--- a/mlir/lib/ExecutionEngine/JitRunner.cpp
+++ b/mlir/lib/ExecutionEngine/JitRunner.cpp
@@ -21,7 +21,7 @@
 #include "mlir/ExecutionEngine/OptUtils.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/MLIRContext.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Support/FileUtilities.h"
 
 #include "llvm/ADT/STLExtras.h"
@@ -122,7 +122,7 @@
 
   llvm::SourceMgr sourceMgr;
   sourceMgr.AddNewSourceBuffer(std::move(file), SMLoc());
-  return OwningOpRef<ModuleOp>(parseSourceFile(sourceMgr, context));
+  return parseSourceFile<ModuleOp>(sourceMgr, context);
 }
 
 static inline Error makeStringError(const Twine &message) {
diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp
--- a/mlir/lib/IR/AffineExpr.cpp
+++ b/mlir/lib/IR/AffineExpr.cpp
@@ -328,9 +328,7 @@
          "unexpected opKind");
   switch (expr.getKind()) {
   case AffineExprKind::Constant:
-    if (expr.cast<AffineConstantExpr>().getValue())
-      return false;
-    return true;
+    return expr.cast<AffineConstantExpr>().getValue() == 0;
   case AffineExprKind::DimId:
     return false;
   case AffineExprKind::SymbolId:
diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -25,6 +25,7 @@
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/SubElementInterfaces.h"
+#include "mlir/IR/Verifier.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
@@ -40,6 +41,7 @@
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Support/SaveAndRestore.h"
+#include "llvm/Support/Threading.h"
 
 #include <tuple>
 
@@ -141,6 +143,11 @@
       "mlir-print-op-generic", llvm::cl::init(false),
       llvm::cl::desc("Print the generic op form"), llvm::cl::Hidden};
 
+  llvm::cl::opt<bool> assumeVerifiedOpt{
+      "mlir-print-assume-verified", llvm::cl::init(false),
+      llvm::cl::desc("Skip op verification when using custom printers"),
+      llvm::cl::Hidden};
+
   llvm::cl::opt<bool> printLocalScopeOpt{
       "mlir-print-local-scope", llvm::cl::init(false),
       llvm::cl::desc("Print with local scope and inline information (eliding "
@@ -160,7 +167,8 @@
 /// Initialize the printing flags with default supplied by the cl::opts above.
 OpPrintingFlags::OpPrintingFlags()
     : printDebugInfoFlag(false), printDebugInfoPrettyFormFlag(false),
-      printGenericOpFormFlag(false), printLocalScope(false) {
+      printGenericOpFormFlag(false), assumeVerifiedFlag(false),
+      printLocalScope(false) {
   // Initialize based upon command line options, if they are available.
   if (!clOptions.isConstructed())
     return;
@@ -169,6 +177,7 @@
   printDebugInfoFlag = clOptions->printDebugInfoOpt;
   printDebugInfoPrettyFormFlag = clOptions->printPrettyDebugInfoOpt;
   printGenericOpFormFlag = clOptions->printGenericOpFormOpt;
+  assumeVerifiedFlag = clOptions->assumeVerifiedOpt;
   printLocalScope = clOptions->printLocalScopeOpt;
 }
 
@@ -196,6 +205,12 @@
   return *this;
 }
 
+/// Do not verify the operation when using custom operation printers.
+OpPrintingFlags &OpPrintingFlags::assumeVerified() {
+  assumeVerifiedFlag = true;
+  return *this;
+}
+
 /// Use local scope when printing the operation. This allows for using the
 /// printer in a more localized and thread-safe setting, but may not necessarily
 /// be identical of what the IR will look like when dumping the full module.
@@ -231,6 +246,11 @@
   return printGenericOpFormFlag;
 }
 
+/// Return if operation verification should be skipped.
+bool OpPrintingFlags::shouldAssumeVerified() const {
+  return assumeVerifiedFlag;
+}
+
 /// Return if the printer should use local scope when dumping the IR.
 bool OpPrintingFlags::shouldUseLocalScope() const { return printLocalScope; }
 
@@ -1245,9 +1265,31 @@
 } // namespace detail
 } // namespace mlir
 
+/// Runs the verifier on `op` and switches the returned flags to the generic
+/// op form when verification fails, because custom print functions may fail
+/// for invalid ops. The flags are returned untouched if they opt out.
+static OpPrintingFlags verifyOpAndAdjustFlags(Operation *op,
+                                              OpPrintingFlags printerFlags) {
+  // Skip when generic form is already requested or the op is assumed valid.
+  const bool skipVerification = printerFlags.shouldPrintGenericOpForm() ||
+                                printerFlags.shouldAssumeVerified();
+  if (!skipVerification) {
+    // Swallow verifier diagnostics raised on this thread only, so that errors
+    // emitted by other threads are not consumed here.
+    const auto callerThreadId = llvm::get_threadid();
+    ScopedDiagnosticHandler silencer(op->getContext(), [&](Diagnostic &) {
+      return success(callerThreadId == llvm::get_threadid());
+    });
+    if (failed(verify(op)))
+      printerFlags.printGenericOpForm();
+  }
+  return printerFlags;
+}
+
 AsmState::AsmState(Operation *op, const OpPrintingFlags &printerFlags,
                    LocationMap *locationMap)
-    : impl(std::make_unique<AsmStateImpl>(op, printerFlags, locationMap)) {}
+    : impl(std::make_unique<AsmStateImpl>(
+          op, verifyOpAndAdjustFlags(op, printerFlags), locationMap)) {}
 AsmState::~AsmState() = default;
 
 const OpPrintingFlags &AsmState::getPrinterFlags() const {
@@ -2853,14 +2895,15 @@
   AsmPrinter::Impl(os).printIntegerSet(*this);
 }
 
-void Value::print(raw_ostream &os) {
+void Value::print(raw_ostream &os) { print(os, OpPrintingFlags()); }
+void Value::print(raw_ostream &os, const OpPrintingFlags &flags) {
   if (!impl) {
     os << "<<NULL VALUE>>";
     return;
   }
 
   if (auto *op = getDefiningOp())
-    return op->print(os);
+    return op->print(os, flags);
   // TODO: Improve BlockArgument print'ing.
   BlockArgument arg = this->cast<BlockArgument>();
   os << "<block argument> of type '" << arg.getType()
diff --git a/mlir/lib/IR/Diagnostics.cpp b/mlir/lib/IR/Diagnostics.cpp
--- a/mlir/lib/IR/Diagnostics.cpp
+++ b/mlir/lib/IR/Diagnostics.cpp
@@ -121,6 +121,17 @@
   return *this;
 }
 
+/// Returns a copy of `flags` tuned for printing ops inside a diagnostic: local
+/// scope, large elements attrs elided, and the generic op form for errors.
+static OpPrintingFlags adjustPrintingFlags(OpPrintingFlags flags,
+                                           DiagnosticSeverity severity) {
+  const bool isError = severity == DiagnosticSeverity::Error;
+  flags.useLocalScope().elideLargeElementsAttrs();
+  if (isError)
+    flags.printGenericOpForm();
+  return flags;
+}
+
 /// Stream in an Operation.
 Diagnostic &Diagnostic::operator<<(Operation &val) {
   return appendOp(val, OpPrintingFlags());
@@ -128,8 +139,7 @@
 Diagnostic &Diagnostic::appendOp(Operation &val, const OpPrintingFlags &flags) {
   std::string str;
   llvm::raw_string_ostream os(str);
-  val.print(os,
-            OpPrintingFlags(flags).useLocalScope().elideLargeElementsAttrs());
+  val.print(os, adjustPrintingFlags(flags, severity));
   return *this << os.str();
 }
 
@@ -137,7 +147,7 @@
 Diagnostic &Diagnostic::operator<<(Value val) {
   std::string str;
   llvm::raw_string_ostream os(str);
-  val.print(os);
+  val.print(os, adjustPrintingFlags(OpPrintingFlags(), severity));
   return *this << os.str();
 }
 
@@ -844,7 +854,7 @@
     Diagnostic diag;
   };
 
-  ParallelDiagnosticHandlerImpl(MLIRContext *ctx) : handlerID(0), context(ctx) {
+  ParallelDiagnosticHandlerImpl(MLIRContext *ctx) : context(ctx) {
     handlerID = ctx->getDiagEngine().registerHandler([this](Diagnostic &diag) {
       uint64_t tid = llvm::get_threadid();
       llvm::sys::SmartScopedLock<true> lock(mutex);
@@ -942,7 +952,7 @@
   mutable std::vector<ThreadDiagnostic> diagnostics;
 
   /// The unique id for the parallel handler.
-  DiagnosticEngine::HandlerID handlerID;
+  DiagnosticEngine::HandlerID handlerID = 0;
 
   /// The context to emit the diagnostics to.
   MLIRContext *context;
diff --git a/mlir/lib/IR/Operation.cpp b/mlir/lib/IR/Operation.cpp
--- a/mlir/lib/IR/Operation.cpp
+++ b/mlir/lib/IR/Operation.cpp
@@ -1097,6 +1097,8 @@
           // Check that any value that is used by an operation is defined in the
           // same region as either an operation result.
           auto *operandRegion = operand.getParentRegion();
+          if (!operandRegion)
+            return op.emitError("operation's operand is unlinked");
           if (!region.isAncestor(operandRegion)) {
             return op.emitOpError("using value defined outside the region")
                        .attachNote(isolatedOp->getLoc())
diff --git a/mlir/lib/Parser/AffineParser.cpp b/mlir/lib/Parser/AffineParser.cpp
--- a/mlir/lib/Parser/AffineParser.cpp
+++ b/mlir/lib/Parser/AffineParser.cpp
@@ -48,7 +48,7 @@
   AffineParser(ParserState &state, bool allowParsingSSAIds = false,
                function_ref<ParseResult(bool)> parseElement = nullptr)
       : Parser(state), allowParsingSSAIds(allowParsingSSAIds),
-        parseElement(parseElement), numDimOperands(0), numSymbolOperands(0) {}
+        parseElement(parseElement) {}
 
   AffineMap parseAffineMapRange(unsigned numDims, unsigned numSymbols);
   ParseResult parseAffineMapOrIntegerSetInline(AffineMap &map, IntegerSet &set);
@@ -92,8 +92,8 @@
 private:
   bool allowParsingSSAIds;
   function_ref<ParseResult(bool)> parseElement;
-  unsigned numDimOperands;
-  unsigned numSymbolOperands;
+  unsigned numDimOperands = 0;
+  unsigned numSymbolOperands = 0;
   SmallVector<std::pair<StringRef, AffineExpr>, 4> dimsAndSymbols;
 };
 } // namespace
diff --git a/mlir/lib/Parser/Lexer.h b/mlir/lib/Parser/Lexer.h
--- a/mlir/lib/Parser/Lexer.h
+++ b/mlir/lib/Parser/Lexer.h
@@ -14,7 +14,7 @@
 #define MLIR_LIB_PARSER_LEXER_H
 
 #include "Token.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 
 namespace mlir {
 class Location;
diff --git a/mlir/lib/Parser/Parser.cpp b/mlir/lib/Parser/Parser.cpp
--- a/mlir/lib/Parser/Parser.cpp
+++ b/mlir/lib/Parser/Parser.cpp
@@ -16,8 +16,8 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Verifier.h"
-#include "mlir/Parser.h"
 #include "mlir/Parser/AsmParserState.h"
+#include "mlir/Parser/Parser.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/StringSet.h"
diff --git a/mlir/lib/Pass/PassRegistry.cpp b/mlir/lib/Pass/PassRegistry.cpp
--- a/mlir/lib/Pass/PassRegistry.cpp
+++ b/mlir/lib/Pass/PassRegistry.cpp
@@ -326,11 +326,11 @@
   /// the name is the name of a pass, the InnerPipeline is empty, since passes
   /// cannot contain inner pipelines.
   struct PipelineElement {
-    PipelineElement(StringRef name) : name(name), registryEntry(nullptr) {}
+    PipelineElement(StringRef name) : name(name) {}
 
     StringRef name;
     StringRef options;
-    const PassRegistryEntry *registryEntry;
+    const PassRegistryEntry *registryEntry = nullptr;
     std::vector<PipelineElement> innerPipeline;
   };
 
diff --git a/mlir/lib/Support/CMakeLists.txt b/mlir/lib/Support/CMakeLists.txt
--- a/mlir/lib/Support/CMakeLists.txt
+++ b/mlir/lib/Support/CMakeLists.txt
@@ -3,7 +3,6 @@
   FileUtilities.cpp
   IndentedOstream.cpp
   InterfaceSupport.cpp
-  MlirOptMain.cpp
   StorageUniquer.cpp
   Timing.cpp
   ToolUtilities.cpp
@@ -24,18 +23,6 @@
   LINK_LIBS PUBLIC
   ${LLVM_PTHREAD_LIB})
 
-add_mlir_library(MLIROptLib
-  MlirOptMain.cpp
-
-  ADDITIONAL_HEADER_DIRS
-  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Support
-
-  LINK_LIBS PUBLIC
-  MLIRPass
-  MLIRParser
-  MLIRSupport
-  )
-
 # This doesn't use add_mlir_library as it is used in mlir-tblgen and else
 # mlir-tblgen ends up depending on mlir-generic-headers.
 add_llvm_library(MLIRSupportIndentedOstream
diff --git a/mlir/lib/Target/Cpp/CMakeLists.txt b/mlir/lib/Target/Cpp/CMakeLists.txt
--- a/mlir/lib/Target/Cpp/CMakeLists.txt
+++ b/mlir/lib/Target/Cpp/CMakeLists.txt
@@ -14,5 +14,5 @@
   MLIRMath
   MLIRSCF
   MLIRSupport
-  MLIRTranslation
+  MLIRTranslateLib
   )
diff --git a/mlir/lib/Target/Cpp/TranslateRegistration.cpp b/mlir/lib/Target/Cpp/TranslateRegistration.cpp
--- a/mlir/lib/Target/Cpp/TranslateRegistration.cpp
+++ b/mlir/lib/Target/Cpp/TranslateRegistration.cpp
@@ -15,7 +15,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/Target/Cpp/CppEmitter.h"
-#include "mlir/Translation.h"
+#include "mlir/Tools/mlir-translate/Translation.h"
 #include "llvm/Support/CommandLine.h"
 
 using namespace mlir;
diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt
--- a/mlir/lib/Target/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt
@@ -30,7 +30,7 @@
   MLIRDLTI
   MLIRLLVMIR
   MLIRLLVMIRTransforms
-  MLIRTranslation
+  MLIRTranslateLib
   )
 
 add_mlir_translation_library(MLIRToLLVMIRTranslationRegistration
@@ -62,5 +62,5 @@
   LINK_LIBS PUBLIC
   MLIRDLTI
   MLIRLLVMIR
-  MLIRTranslation
+  MLIRTranslateLib
   )
diff --git a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
--- a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
@@ -20,7 +20,7 @@
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/Target/LLVMIR/TypeFromLLVM.h"
-#include "mlir/Translation.h"
+#include "mlir/Tools/mlir-translate/Translation.h"
 
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/TypeSwitch.h"
diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
--- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
@@ -14,7 +14,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Target/LLVMIR/Dialect/All.h"
 #include "mlir/Target/LLVMIR/Export.h"
-#include "mlir/Translation.h"
+#include "mlir/Tools/mlir-translate/Translation.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 
diff --git a/mlir/lib/Target/SPIRV/CMakeLists.txt b/mlir/lib/Target/SPIRV/CMakeLists.txt
--- a/mlir/lib/Target/SPIRV/CMakeLists.txt
+++ b/mlir/lib/Target/SPIRV/CMakeLists.txt
@@ -24,5 +24,5 @@
   MLIRSPIRVSerialization
   MLIRSPIRVDeserialization
   MLIRSupport
-  MLIRTranslation
+  MLIRTranslateLib
   )
diff --git a/mlir/lib/Target/SPIRV/Deserialization/CMakeLists.txt b/mlir/lib/Target/SPIRV/Deserialization/CMakeLists.txt
--- a/mlir/lib/Target/SPIRV/Deserialization/CMakeLists.txt
+++ b/mlir/lib/Target/SPIRV/Deserialization/CMakeLists.txt
@@ -11,7 +11,7 @@
   MLIRSPIRV
   MLIRSPIRVBinaryUtils
   MLIRSupport
-  MLIRTranslation
+  MLIRTranslateLib
   )
 
 
diff --git a/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt b/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt
--- a/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt
+++ b/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt
@@ -11,7 +11,7 @@
   MLIRSPIRV
   MLIRSPIRVBinaryUtils
   MLIRSupport
-  MLIRTranslation
+  MLIRTranslateLib
   )
 
 
diff --git a/mlir/lib/Target/SPIRV/TranslateRegistration.cpp b/mlir/lib/Target/SPIRV/TranslateRegistration.cpp
--- a/mlir/lib/Target/SPIRV/TranslateRegistration.cpp
+++ b/mlir/lib/Target/SPIRV/TranslateRegistration.cpp
@@ -17,11 +17,11 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Verifier.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Target/SPIRV/Deserialization.h"
 #include "mlir/Target/SPIRV/Serialization.h"
-#include "mlir/Translation.h"
+#include "mlir/Tools/mlir-translate/Translation.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SMLoc.h"
diff --git a/mlir/lib/Tools/CMakeLists.txt b/mlir/lib/Tools/CMakeLists.txt
--- a/mlir/lib/Tools/CMakeLists.txt
+++ b/mlir/lib/Tools/CMakeLists.txt
@@ -1,3 +1,5 @@
 add_subdirectory(mlir-lsp-server)
+add_subdirectory(mlir-opt)
 add_subdirectory(mlir-reduce)
+add_subdirectory(mlir-translate)
 add_subdirectory(PDLL)
diff --git a/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp b/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp
--- a/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp
+++ b/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp
@@ -13,7 +13,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Verifier.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Tools/PDLL/AST/Context.h"
 #include "mlir/Tools/PDLL/AST/Nodes.h"
 #include "mlir/Tools/PDLL/AST/Types.h"
diff --git a/mlir/lib/Tools/PDLL/Parser/Parser.cpp b/mlir/lib/Tools/PDLL/Parser/Parser.cpp
--- a/mlir/lib/Tools/PDLL/Parser/Parser.cpp
+++ b/mlir/lib/Tools/PDLL/Parser/Parser.cpp
@@ -43,8 +43,7 @@
 public:
   Parser(ast::Context &ctx, llvm::SourceMgr &sourceMgr)
       : ctx(ctx), lexer(sourceMgr, ctx.getDiagEngine()),
-        curToken(lexer.lexToken()), curDeclScope(nullptr),
-        valueTy(ast::ValueType::get(ctx)),
+        curToken(lexer.lexToken()), valueTy(ast::ValueType::get(ctx)),
         valueRangeTy(ast::ValueRangeType::get(ctx)),
         typeTy(ast::TypeType::get(ctx)),
         typeRangeTy(ast::TypeRangeType::get(ctx)),
@@ -469,7 +468,7 @@
   Token curToken;
 
   /// The most recently defined decl scope.
-  ast::DeclScope *curDeclScope;
+  ast::DeclScope *curDeclScope = nullptr;
   llvm::SpecificBumpPtrAllocator<ast::DeclScope> scopeAllocator;
 
   /// The current context of the parser.
diff --git a/mlir/lib/Tools/mlir-lsp-server/MLIRServer.cpp b/mlir/lib/Tools/mlir-lsp-server/MLIRServer.cpp
--- a/mlir/lib/Tools/mlir-lsp-server/MLIRServer.cpp
+++ b/mlir/lib/Tools/mlir-lsp-server/MLIRServer.cpp
@@ -10,8 +10,8 @@
 #include "lsp/Logging.h"
 #include "lsp/Protocol.h"
 #include "mlir/IR/Operation.h"
-#include "mlir/Parser.h"
 #include "mlir/Parser/AsmParserState.h"
+#include "mlir/Parser/Parser.h"
 #include "llvm/Support/SourceMgr.h"
 
 using namespace mlir;
@@ -716,7 +716,7 @@
   int64_t version;
 
   /// The number of lines in the file.
-  int64_t totalNumLines;
+  int64_t totalNumLines = 0;
 
   /// The chunks of this file. The order of these chunks is the order in which
   /// they appear in the text file.
@@ -728,7 +728,7 @@
                            int64_t version, DialectRegistry &registry,
                            std::vector<lsp::Diagnostic> &diagnostics)
     : context(registry, MLIRContext::Threading::DISABLED),
-      contents(fileContents.str()), version(version), totalNumLines(0) {
+      contents(fileContents.str()), version(version) {
   context.allowUnregisteredDialects();
 
   // Split the file into separate MLIR documents.
diff --git a/mlir/lib/Tools/mlir-opt/CMakeLists.txt b/mlir/lib/Tools/mlir-opt/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Tools/mlir-opt/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_mlir_library(MLIROptLib
+  MlirOptMain.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Tools/mlir-opt
+
+  LINK_LIBS PUBLIC
+  MLIRPass
+  MLIRParser
+  MLIRSupport
+  )
diff --git a/mlir/lib/Support/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
rename from mlir/lib/Support/MlirOptMain.cpp
rename to mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
--- a/mlir/lib/Support/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Support/MlirOptMain.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -19,7 +19,7 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/MLIRContext.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/DebugCounter.h"
@@ -59,7 +59,7 @@
 
   // Parse the input file and reset the context threading state.
   TimingScope parserTiming = timing.nest("Parser");
-  OwningOpRef<ModuleOp> module(parseSourceFile(sourceMgr, context));
+  OwningOpRef<ModuleOp> module(parseSourceFile<ModuleOp>(sourceMgr, context));
   context->enableMultithreading(wasThreadingEnabled);
   if (!module)
     return failure();
diff --git a/mlir/lib/Tools/mlir-reduce/MlirReduceMain.cpp b/mlir/lib/Tools/mlir-reduce/MlirReduceMain.cpp
--- a/mlir/lib/Tools/mlir-reduce/MlirReduceMain.cpp
+++ b/mlir/lib/Tools/mlir-reduce/MlirReduceMain.cpp
@@ -15,7 +15,7 @@
 
 #include "mlir/Tools/mlir-reduce/MlirReduceMain.h"
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Reducer/Passes.h"
@@ -31,7 +31,7 @@
 static LogicalResult loadModule(MLIRContext &context,
                                 OwningOpRef<ModuleOp> &module,
                                 StringRef inputFilename) {
-  module = parseSourceFile(inputFilename, &context);
+  module = parseSourceFile<ModuleOp>(inputFilename, &context);
   if (!module)
     return failure();
 
diff --git a/mlir/lib/Tools/mlir-translate/CMakeLists.txt b/mlir/lib/Tools/mlir-translate/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Tools/mlir-translate/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_mlir_library(MLIRTranslateLib
+  MlirTranslateMain.cpp
+  Translation.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Tools/mlir-translate
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRParser
+  )
diff --git a/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp b/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp
@@ -0,0 +1,111 @@
+//===- MlirTranslateMain.cpp - MLIR Translation entry point ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Tools/mlir-translate/MlirTranslateMain.h"
+#include "mlir/IR/AsmState.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Verifier.h"
+#include "mlir/Parser/Parser.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Support/ToolUtilities.h"
+#include "mlir/Tools/mlir-translate/Translation.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// Translate Entry Point
+//===----------------------------------------------------------------------===//
+
+LogicalResult mlir::mlirTranslateMain(int argc, char **argv,
+                                      llvm::StringRef toolName) {
+  // Command-line options controlling input/output and processing mode.
+  static llvm::cl::opt<std::string> inputFilename(
+      llvm::cl::Positional, llvm::cl::desc("<input file>"),
+      llvm::cl::init("-"));
+
+  static llvm::cl::opt<std::string> outputFilename(
+      "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"),
+      llvm::cl::init("-"));
+
+  static llvm::cl::opt<bool> allowUnregisteredDialects(
+      "allow-unregistered-dialect",
+      llvm::cl::desc("Allow operation with no registered dialects"),
+      llvm::cl::init(false));
+
+  static llvm::cl::opt<bool> splitInputFile(
+      "split-input-file",
+      llvm::cl::desc("Split the input file into pieces and "
+                     "process each chunk independently"),
+      llvm::cl::init(false));
+
+  static llvm::cl::opt<bool> verifyDiagnostics(
+      "verify-diagnostics",
+      llvm::cl::desc("Check that emitted diagnostics match "
+                     "expected-* lines on the corresponding line"),
+      llvm::cl::init(false));
+
+  llvm::InitLLVM y(argc, argv);
+
+  // Add flags for all the registered translations.
+  llvm::cl::opt<const TranslateFunction *, false, TranslationParser>
+      translationRequested("", llvm::cl::desc("Translation to perform"),
+                           llvm::cl::Required);
+  registerAsmPrinterCLOptions();
+  registerMLIRContextCLOptions();
+  llvm::cl::ParseCommandLineOptions(argc, argv, toolName);
+
+  std::string errorMessage;
+  auto input = openInputFile(inputFilename, &errorMessage);
+  if (!input) {
+    llvm::errs() << errorMessage << "\n";
+    return failure();
+  }
+
+  auto output = openOutputFile(outputFilename, &errorMessage);
+  if (!output) {
+    llvm::errs() << errorMessage << "\n";
+    return failure();
+  }
+
+  // Runs the requested translation on one buffer using a fresh MLIRContext.
+  auto processBuffer = [&](std::unique_ptr<llvm::MemoryBuffer> ownedBuffer,
+                           raw_ostream &os) {
+    MLIRContext context;
+    context.allowUnregisteredDialects(allowUnregisteredDialects);
+    context.printOpOnDiagnostic(!verifyDiagnostics);
+    llvm::SourceMgr sourceMgr;
+    sourceMgr.AddNewSourceBuffer(std::move(ownedBuffer), SMLoc());
+
+    if (!verifyDiagnostics) {
+      SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context);
+      return (*translationRequested)(sourceMgr, os, &context);
+    }
+
+    // In the diagnostic verification flow, we ignore whether the translation
+    // failed (in most cases, it is expected to fail). Instead, we check if the
+    // diagnostics were produced as expected.
+    SourceMgrDiagnosticVerifierHandler sourceMgrHandler(sourceMgr, &context);
+    (void)(*translationRequested)(sourceMgr, os, &context);
+    return sourceMgrHandler.verify();
+  };
+
+  if (splitInputFile) {
+    if (failed(splitAndProcessBuffer(std::move(input), processBuffer,
+                                     output->os())))
+      return failure();
+  } else if (failed(processBuffer(std::move(input), output->os()))) {
+    return failure();
+  }
+
+  output->keep();
+  return success();
+}
diff --git a/mlir/lib/Translation/Translation.cpp b/mlir/lib/Tools/mlir-translate/Translation.cpp
rename from mlir/lib/Translation/Translation.cpp
rename to mlir/lib/Tools/mlir-translate/Translation.cpp
--- a/mlir/lib/Translation/Translation.cpp
+++ b/mlir/lib/Tools/mlir-translate/Translation.cpp
@@ -10,17 +10,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Translation.h"
+#include "mlir/Tools/mlir-translate/Translation.h"
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Verifier.h"
-#include "mlir/Parser.h"
-#include "mlir/Support/FileUtilities.h"
-#include "mlir/Support/ToolUtilities.h"
-#include "llvm/Support/InitLLVM.h"
+#include "mlir/Parser/Parser.h"
 #include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/ToolOutputFile.h"
 
 using namespace mlir;
 
@@ -101,7 +97,7 @@
     DialectRegistry registry;
     dialectRegistration(registry);
     context->appendDialectRegistry(registry);
-    auto module = OwningOpRef<ModuleOp>(parseSourceFile(sourceMgr, context));
+    auto module = parseSourceFile<ModuleOp>(sourceMgr, context);
     if (!module || failed(verify(*module)))
       return failure();
     return function(module.get(), output);
@@ -128,88 +124,3 @@
                        });
   llvm::cl::parser<const TranslateFunction *>::printOptionInfo(o, globalWidth);
 }
-
-LogicalResult mlir::mlirTranslateMain(int argc, char **argv,
-                                      llvm::StringRef toolName) {
-
-  static llvm::cl::opt<std::string> inputFilename(
-      llvm::cl::Positional, llvm::cl::desc("<input file>"),
-      llvm::cl::init("-"));
-
-  static llvm::cl::opt<std::string> outputFilename(
-      "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"),
-      llvm::cl::init("-"));
-
-  static llvm::cl::opt<bool> allowUnregisteredDialects(
-      "allow-unregistered-dialect",
-      llvm::cl::desc("Allow operation with no registered dialects"),
-      llvm::cl::init(false));
-
-  static llvm::cl::opt<bool> splitInputFile(
-      "split-input-file",
-      llvm::cl::desc("Split the input file into pieces and "
-                     "process each chunk independently"),
-      llvm::cl::init(false));
-
-  static llvm::cl::opt<bool> verifyDiagnostics(
-      "verify-diagnostics",
-      llvm::cl::desc("Check that emitted diagnostics match "
-                     "expected-* lines on the corresponding line"),
-      llvm::cl::init(false));
-
-  llvm::InitLLVM y(argc, argv);
-
-  // Add flags for all the registered translations.
-  llvm::cl::opt<const TranslateFunction *, false, TranslationParser>
-      translationRequested("", llvm::cl::desc("Translation to perform"),
-                           llvm::cl::Required);
-  registerAsmPrinterCLOptions();
-  registerMLIRContextCLOptions();
-  llvm::cl::ParseCommandLineOptions(argc, argv, toolName);
-
-  std::string errorMessage;
-  auto input = openInputFile(inputFilename, &errorMessage);
-  if (!input) {
-    llvm::errs() << errorMessage << "\n";
-    return failure();
-  }
-
-  auto output = openOutputFile(outputFilename, &errorMessage);
-  if (!output) {
-    llvm::errs() << errorMessage << "\n";
-    return failure();
-  }
-
-  // Processes the memory buffer with a new MLIRContext.
-  auto processBuffer = [&](std::unique_ptr<llvm::MemoryBuffer> ownedBuffer,
-                           raw_ostream &os) {
-    MLIRContext context;
-    context.allowUnregisteredDialects(allowUnregisteredDialects);
-    context.printOpOnDiagnostic(!verifyDiagnostics);
-    llvm::SourceMgr sourceMgr;
-    sourceMgr.AddNewSourceBuffer(std::move(ownedBuffer), SMLoc());
-
-    if (!verifyDiagnostics) {
-      SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context);
-      return (*translationRequested)(sourceMgr, os, &context);
-    }
-
-    // In the diagnostic verification flow, we ignore whether the translation
-    // failed (in most cases, it is expected to fail). Instead, we check if the
-    // diagnostics were produced as expected.
-    SourceMgrDiagnosticVerifierHandler sourceMgrHandler(sourceMgr, &context);
-    (void)(*translationRequested)(sourceMgr, os, &context);
-    return sourceMgrHandler.verify();
-  };
-
-  if (splitInputFile) {
-    if (failed(splitAndProcessBuffer(std::move(input), processBuffer,
-                                     output->os())))
-      return failure();
-  } else if (failed(processBuffer(std::move(input), output->os()))) {
-    return failure();
-  }
-
-  output->keep();
-  return success();
-}
diff --git a/mlir/lib/Transforms/CSE.cpp b/mlir/lib/Transforms/CSE.cpp
--- a/mlir/lib/Transforms/CSE.cpp
+++ b/mlir/lib/Transforms/CSE.cpp
@@ -63,8 +63,7 @@
   /// Represents a single entry in the depth first traversal of a CFG.
   struct CFGStackNode {
     CFGStackNode(ScopedMapTy &knownValues, DominanceInfoNode *node)
-        : scope(knownValues), node(node), childIterator(node->begin()),
-          processed(false) {}
+        : scope(knownValues), node(node), childIterator(node->begin()) {}
 
     /// Scope for the known values.
     ScopedMapTy::ScopeTy scope;
@@ -73,7 +72,7 @@
     DominanceInfoNode::const_iterator childIterator;
 
     /// If this node has been fully processed yet or not.
-    bool processed;
+    bool processed = false;
   };
 
   /// Attempt to eliminate a redundant operation. Returns success if the
diff --git a/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp b/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp
--- a/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp
+++ b/mlir/lib/Transforms/Utils/ControlFlowSinkUtils.cpp
@@ -35,8 +35,7 @@
   /// Create an operation sinker with given dominance info.
   Sinker(function_ref<bool(Operation *, Region *)> shouldMoveIntoRegion,
          DominanceInfo &domInfo)
-      : shouldMoveIntoRegion(shouldMoveIntoRegion), domInfo(domInfo),
-        numSunk(0) {}
+      : shouldMoveIntoRegion(shouldMoveIntoRegion), domInfo(domInfo) {}
 
   /// Given a list of regions, find operations to sink and sink them. Return the
   /// number of operations sunk.
@@ -65,7 +64,7 @@
   /// Dominance info to determine op user dominance with respect to regions.
   DominanceInfo &domInfo;
   /// The number of operations sunk.
-  size_t numSunk;
+  size_t numSunk = 0;
 };
 } // end anonymous namespace
 
diff --git a/mlir/lib/Translation/CMakeLists.txt b/mlir/lib/Translation/CMakeLists.txt
deleted file mode 100644
--- a/mlir/lib/Translation/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-add_mlir_library(MLIRTranslation
-  Translation.cpp
-
-  ADDITIONAL_HEADER_DIRS
-  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Translation
-
-  LINK_LIBS PUBLIC
-  MLIRIR
-  MLIRParser
-  )
diff --git a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir
--- a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir
+++ b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir
@@ -26,6 +26,16 @@
 
 // -----
 
+// CHECK-LABEL: func @relative_bounds
+func @relative_bounds(%arg: index) {
+  // CHECK: affine.for %{{.*}} = 0 to 4
+  affine.for %i = affine_map<(d0) -> (d0)>(%arg) to affine_map<(d0) -> (d0 + 4)>(%arg) {
+  }
+  return
+}
+
+// -----
+
 // Check that single iteration loop is removed and its body is promoted to the
 // parent block.
 
@@ -103,7 +113,7 @@
 // CHECK-DAG: [[$OUTERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32 + 2)>
 // CHECK-DAG: [[$INNERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 + 2)>
 // CHECK-DAG: [[$OUTERUB:#map[0-9]+]] = affine_map<()[s0] -> ((s0 - 2) ceildiv 32)>
-// CHECK-DAG: [[$INNERUB:#map[0-9]+]] = affine_map<(d0) -> (d0 - 2, 510)>
+// CHECK-DAG: [[$INNERUB:#map[0-9]+]] = affine_map<()[s0] -> (s0 - 2, 510)>
 
 // CHECK-LABEL: func @loop_with_multiple_upper_bounds
 // CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: index)
@@ -111,7 +121,7 @@
 // CHECK-NEXT:  %[[DIM:.*]] = memref.dim %arg0, %c0 : memref<?x?xf32>
 // CHECK-NEXT:   affine.for %[[I:.*]] = 0 to [[$OUTERUB]]()[%[[DIM]]] {
 // CHECK-NEXT:     %[[IIV:.*]] = affine.apply [[$OUTERIV]](%[[I]])
-// CHECK-NEXT:     affine.for %[[II:.*]] = 0 to min [[$INNERUB]](%[[ARG1]]) {
+// CHECK-NEXT:     affine.for %[[II:.*]] = 0 to min [[$INNERUB]]()[%[[ARG1]]] {
 // CHECK-NEXT:       %[[IIIV:.*]] = affine.apply [[$INNERIV]](%[[II]])
 // CHECK-NEXT:       "test.foo"(%[[IIV]], %[[IIIV]])
 // CHECK-NEXT:     }
@@ -133,7 +143,7 @@
 
 // CHECK-DAG: [[$INTERUB:#map[0-9]+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
 // CHECK-DAG: [[$INTERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32)>
-// CHECK-DAG: [[$INTRAUB:#map[0-9]+]] = affine_map<(d0, d1)[s0] -> (32, -d0 + s0)>
+// CHECK-DAG: [[$INTRAUB:#map[0-9]+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
 // CHECK-DAG: [[$INTRAIV:#map[0-9]+]] = affine_map<(d0, d1) -> (d1 + d0)>
 
 // CHECK-LABEL: func @tiled_matmul
@@ -149,11 +159,11 @@
 // CHECK-NEXT:        %[[JIV:.*]] = affine.apply [[$INTERIV]](%[[J]])
 // CHECK-NEXT:        affine.for %[[K:.*]] = 0 to [[$INTERUB]]()[%[[DIM2]]] {
 // CHECK-NEXT:          %[[KIV:.*]] = affine.apply [[$INTERIV]](%[[K]])
-// CHECK-NEXT:          affine.for %[[II:.*]] = 0 to min [[$INTRAUB]](%[[IIV]], %[[IIV]])[%[[DIM0]]] {
+// CHECK-NEXT:          affine.for %[[II:.*]] = 0 to min [[$INTRAUB]](%[[IIV]])[%[[DIM0]]] {
 // CHECK-NEXT:            %[[IIIV:.*]] = affine.apply [[$INTRAIV]](%[[IIV]], %[[II]])
-// CHECK-NEXT:            affine.for %[[JJ:.*]] = 0 to min [[$INTRAUB]](%[[JIV]], %[[JIV]])[%[[DIM1]]] {
+// CHECK-NEXT:            affine.for %[[JJ:.*]] = 0 to min [[$INTRAUB]](%[[JIV]])[%[[DIM1]]] {
 // CHECK-NEXT:              %[[JJIV:.*]] = affine.apply [[$INTRAIV]](%[[JIV]], %[[JJ]])
-// CHECK-NEXT:              affine.for %[[KK:.*]] = 0 to min [[$INTRAUB]](%[[KIV]], %[[KIV]])[%[[DIM2]]] {
+// CHECK-NEXT:              affine.for %[[KK:.*]] = 0 to min [[$INTRAUB]](%[[KIV]])[%[[DIM2]]] {
 // CHECK-NEXT:                %[[KKIV:.*]] = affine.apply [[$INTRAIV]](%[[KIV]], %[[KK]])
 // CHECK-NEXT:                %{{.*}} = affine.load %[[ARG0]][%[[IIIV]], %[[KKIV]]] : memref<1024x1024xf32>
 // CHECK-NEXT:                %{{.*}} = affine.load %[[ARG1]][%[[KKIV]], %[[JJIV]]] : memref<1024x1024xf32>
diff --git a/mlir/test/IR/print-ir-invalid.mlir b/mlir/test/IR/print-ir-invalid.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/IR/print-ir-invalid.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-opt -test-print-invalid %s | FileCheck %s
+// RUN: mlir-opt -test-print-invalid %s --mlir-print-assume-verified | FileCheck %s --check-prefix=ASSUME-VERIFIED
+
+// The pass creates some ops and prints them to stdout, the input is just an
+// empty module.
+module {}
+
+// The operation is invalid because the body does not have a terminator, print
+// the generic form.
+// CHECK:      Invalid operation:
+// CHECK-NEXT: "builtin.func"() ({
+// CHECK-NEXT: ^bb0:
+// CHECK-NEXT: })
+// CHECK-SAME: sym_name = "test"
+
+// The operation is valid because the body has a terminator, print the custom
+// form.
+// CHECK:      Valid operation:
+// CHECK-NEXT: func @test() {
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+
+// With --mlir-print-assume-verified the custom form is printed in both cases.
+// This works in this particular case, but may crash in general.
+
+// ASSUME-VERIFIED:      Invalid operation:
+// ASSUME-VERIFIED-NEXT: func @test() {
+// ASSUME-VERIFIED-NEXT: }
+
+// ASSUME-VERIFIED:      Valid operation:
+// ASSUME-VERIFIED-NEXT: func @test() {
+// ASSUME-VERIFIED-NEXT:   return
+// ASSUME-VERIFIED-NEXT: }
diff --git a/mlir/test/Target/LLVMIR/openmp-nested.mlir b/mlir/test/Target/LLVMIR/openmp-nested.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-nested.mlir
@@ -0,0 +1,41 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+ 
+module {
+  llvm.func @printf(!llvm.ptr<i8>, ...) -> i32
+  llvm.mlir.global internal constant @str0("WG size of kernel = %d X %d\0A\00")
+
+  llvm.func @main(%arg0: i32, %arg1: !llvm.ptr<ptr<i8>>) -> i32 {
+    omp.parallel   {
+      %0 = llvm.mlir.constant(1 : index) : i64
+      %1 = llvm.mlir.constant(10 : index) : i64
+      %2 = llvm.mlir.constant(0 : index) : i64
+      %4 = llvm.mlir.constant(0 : i32) : i32
+      %12 = llvm.alloca %0 x i64 : (i64) -> !llvm.ptr<i64>
+      omp.wsloop (%arg2) : i64 = (%2) to (%1) step (%0)  {
+        omp.parallel   {
+          omp.wsloop (%arg3) : i64 = (%2) to (%0) step (%0)  {
+            llvm.store %2, %12 : !llvm.ptr<i64>
+            omp.yield
+          }
+          omp.terminator
+        }
+        %19 = llvm.load %12 : !llvm.ptr<i64>
+        %20 = llvm.trunc %19 : i64 to i32
+        %5 = llvm.mlir.addressof @str0 : !llvm.ptr<array<29 x i8>>
+        %6 = llvm.getelementptr %5[%4, %4] : (!llvm.ptr<array<29 x i8>>, i32, i32) -> !llvm.ptr<i8>
+        %21 = llvm.call @printf(%6, %20, %20) : (!llvm.ptr<i8>, i32, i32) -> i32
+        omp.yield
+      }
+      omp.terminator
+    }
+    %a4 = llvm.mlir.constant(0 : i32) : i32
+    llvm.return %a4 : i32
+  }
+
+}
+
+// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @[[inner1:.+]] to void (i32*, i32*, ...)*))
+
+// CHECK: define internal void @[[inner1]]
+// CHECK: %[[structArg:.+]] = alloca { i64* }
+// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i64* }*)* @[[inner2:.+]] to void (i32*, i32*, ...)*), { i64* }* %[[structArg]])
diff --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
--- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
@@ -65,10 +65,7 @@
 
         auto resultType = op.result().getType().cast<ShapedType>();
         constexpr int64_t kConstantFoldingMaxNumElements = 1024;
-        if (resultType.getNumElements() > kConstantFoldingMaxNumElements)
-          return false;
-
-        return true;
+        return resultType.getNumElements() <= kConstantFoldingMaxNumElements;
       };
 
   tensor::populateFoldConstantExtractSlicePatterns(patterns, controlFn);
diff --git a/mlir/test/lib/IR/CMakeLists.txt b/mlir/test/lib/IR/CMakeLists.txt
--- a/mlir/test/lib/IR/CMakeLists.txt
+++ b/mlir/test/lib/IR/CMakeLists.txt
@@ -9,6 +9,7 @@
   TestOpaqueLoc.cpp
   TestOperationEquals.cpp
   TestPrintDefUse.cpp
+  TestPrintInvalid.cpp
   TestPrintNesting.cpp
   TestSideEffects.cpp
   TestSlicing.cpp
diff --git a/mlir/test/lib/IR/TestPrintInvalid.cpp b/mlir/test/lib/IR/TestPrintInvalid.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/test/lib/IR/TestPrintInvalid.cpp
@@ -0,0 +1,58 @@
+//===- TestPrintInvalid.cpp - Test printing invalid ops -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass creates and prints to the standard output an invalid operation and
+// a valid operation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+
+namespace {
+/// Module pass that builds a function named "test", prints it to stdout
+/// before and after a return op is appended to its entry block, and then
+/// erases it again.
+struct TestPrintInvalidPass
+    : public PassWrapper<TestPrintInvalidPass, OperationPass<ModuleOp>> {
+  StringRef getArgument() const final { return "test-print-invalid"; }
+  StringRef getDescription() const final {
+    return "Test printing invalid ops.";
+  }
+  /// Declares the func dialect as a dependency; runOnOperation creates ops
+  /// from it.
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<func::FuncDialect>();
+  }
+
+  void runOnOperation() override {
+    Location loc = getOperation().getLoc();
+    OpBuilder builder(getOperation().body());
+    auto funcOp = builder.create<FuncOp>(
+        loc, "test", FunctionType::get(getOperation().getContext(), {}, {}));
+    funcOp.addEntryBlock();
+    // The created function is invalid because there is no return op.
+    llvm::outs() << "Invalid operation:\n" << funcOp << "\n";
+    builder.setInsertionPointToEnd(&funcOp.getBody().front());
+    builder.create<func::ReturnOp>(loc);
+    // Now this function is valid.
+    llvm::outs() << "Valid operation:\n" << funcOp << "\n";
+    funcOp.erase();
+  }
+};
+} // namespace
+
+namespace mlir {
+/// Registers TestPrintInvalidPass (command-line argument `test-print-invalid`).
+void registerTestPrintInvalidPass() {
+  PassRegistration<TestPrintInvalidPass>{};
+}
+} // namespace mlir
diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
--- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
+++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
@@ -16,7 +16,7 @@
 
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/MLIRContext.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/Optional.h"
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -18,7 +18,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/FileUtilities.h"
-#include "mlir/Support/MlirOptMain.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/SourceMgr.h"
@@ -45,6 +45,7 @@
 void registerTestMatchers();
 void registerTestOperationEqualPass();
 void registerTestPrintDefUsePass();
+void registerTestPrintInvalidPass();
 void registerTestPrintNestingPass();
 void registerTestReducer();
 void registerTestSpirvEntryPointABIPass();
@@ -132,6 +133,7 @@
   registerTestMatchers();
   registerTestOperationEqualPass();
   registerTestPrintDefUsePass();
+  registerTestPrintInvalidPass();
   registerTestPrintNestingPass();
   registerTestReducer();
   registerTestSpirvEntryPointABIPass();
diff --git a/mlir/tools/mlir-spirv-cpu-runner/CMakeLists.txt b/mlir/tools/mlir-spirv-cpu-runner/CMakeLists.txt
--- a/mlir/tools/mlir-spirv-cpu-runner/CMakeLists.txt
+++ b/mlir/tools/mlir-spirv-cpu-runner/CMakeLists.txt
@@ -29,7 +29,7 @@
     MLIRSPIRV
     MLIRTargetLLVMIRExport
     MLIRTransforms
-    MLIRTranslation
+    MLIRTranslateLib
     MLIRSupport
   )
 endif()
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -301,8 +301,8 @@
   };
 
   OperationFormat(const Operator &op)
-      : allOperands(false), allOperandTypes(false), allResultTypes(false),
-        infersResultTypes(false) {
+
+  {
     operandTypes.resize(op.getNumOperands(), TypeResolution());
     resultTypes.resize(op.getNumResults(), TypeResolution());
 
@@ -346,10 +346,10 @@
 
   /// A flag indicating if all operand/result types were seen. If the format
   /// contains these, it can not contain individual type resolvers.
-  bool allOperands, allOperandTypes, allResultTypes;
+  bool allOperands = false, allOperandTypes = false, allResultTypes = false;
 
   /// A flag indicating if this operation infers its result types
-  bool infersResultTypes;
+  bool infersResultTypes = false;
 
   /// A flag indicating if this operation has the SingleBlockImplicitTerminator
   /// trait.
@@ -2851,7 +2851,7 @@
     if (failed(lelement))
       return failure();
     literalElements.push_back(*lelement);
-    parsingElements.push_back(std::vector<FormatElement *>());
+    parsingElements.emplace_back();
     std::vector<FormatElement *> &currParsingElements = parsingElements.back();
     while (peekToken().getKind() != FormatToken::pipe &&
            peekToken().getKind() != FormatToken::r_paren) {
diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp
--- a/mlir/tools/mlir-tblgen/RewriterGen.cpp
+++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp
@@ -243,7 +243,7 @@
   StaticMatcherHelper &staticMatcherHelper;
 
   // The next unused ID for newly created values.
-  unsigned nextValueId;
+  unsigned nextValueId = 0;
 
   raw_indented_ostream os;
 
@@ -333,8 +333,7 @@
 PatternEmitter::PatternEmitter(Record *pat, RecordOperatorMap *mapper,
                                raw_ostream &os, StaticMatcherHelper &helper)
     : loc(pat->getLoc()), opMap(mapper), pattern(pat, mapper),
-      symbolInfoMap(pat->getLoc()), staticMatcherHelper(helper), nextValueId(0),
-      os(os) {
+      symbolInfoMap(pat->getLoc()), staticMatcherHelper(helper), os(os) {
   fmtCtx.withBuilder("rewriter");
 }
 
diff --git a/mlir/tools/mlir-translate/CMakeLists.txt b/mlir/tools/mlir-translate/CMakeLists.txt
--- a/mlir/tools/mlir-translate/CMakeLists.txt
+++ b/mlir/tools/mlir-translate/CMakeLists.txt
@@ -18,7 +18,7 @@
   MLIRParser
   MLIRPass
   MLIRSPIRV
-  MLIRTranslation
+  MLIRTranslateLib
   MLIRSupport
   )
 
diff --git a/mlir/tools/mlir-translate/mlir-translate.cpp b/mlir/tools/mlir-translate/mlir-translate.cpp
--- a/mlir/tools/mlir-translate/mlir-translate.cpp
+++ b/mlir/tools/mlir-translate/mlir-translate.cpp
@@ -13,7 +13,7 @@
 
 #include "mlir/InitAllTranslations.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Translation.h"
+#include "mlir/Tools/mlir-translate/MlirTranslateMain.h"
 
 using namespace mlir;
 
diff --git a/mlir/tools/mlir-vulkan-runner/CMakeLists.txt b/mlir/tools/mlir-vulkan-runner/CMakeLists.txt
--- a/mlir/tools/mlir-vulkan-runner/CMakeLists.txt
+++ b/mlir/tools/mlir-vulkan-runner/CMakeLists.txt
@@ -73,7 +73,7 @@
     MLIRSupport
     MLIRTargetLLVMIRExport
     MLIRTransforms
-    MLIRTranslation
+    MLIRTranslateLib
     ${Vulkan_LIBRARY}
   )
 
diff --git a/mlir/unittests/Dialect/Affine/Analysis/AffineStructuresParser.cpp b/mlir/unittests/Dialect/Affine/Analysis/AffineStructuresParser.cpp
--- a/mlir/unittests/Dialect/Affine/Analysis/AffineStructuresParser.cpp
+++ b/mlir/unittests/Dialect/Affine/Analysis/AffineStructuresParser.cpp
@@ -8,7 +8,7 @@
 
 #include "./AffineStructuresParser.h"
 #include "mlir/IR/IntegerSet.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 
 using namespace mlir;
 
diff --git a/mlir/unittests/ExecutionEngine/Invoke.cpp b/mlir/unittests/ExecutionEngine/Invoke.cpp
--- a/mlir/unittests/ExecutionEngine/Invoke.cpp
+++ b/mlir/unittests/ExecutionEngine/Invoke.cpp
@@ -20,7 +20,7 @@
 #include "mlir/ExecutionEngine/RunnerUtils.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/InitAllDialects.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Export.h"
diff --git a/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp b/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp
--- a/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp
+++ b/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp
@@ -12,7 +12,7 @@
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/OpImplementation.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 
 #include <gtest/gtest.h>
 
diff --git a/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp b/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp
--- a/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp
+++ b/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp
@@ -14,7 +14,7 @@
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/OpImplementation.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 
 #include <gtest/gtest.h>
 
diff --git a/mlir/unittests/Interfaces/InferTypeOpInterfaceTest.cpp b/mlir/unittests/Interfaces/InferTypeOpInterfaceTest.cpp
--- a/mlir/unittests/Interfaces/InferTypeOpInterfaceTest.cpp
+++ b/mlir/unittests/Interfaces/InferTypeOpInterfaceTest.cpp
@@ -16,7 +16,7 @@
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/OpImplementation.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 
 #include <gtest/gtest.h>
 
diff --git a/mlir/unittests/Transforms/Canonicalizer.cpp b/mlir/unittests/Transforms/Canonicalizer.cpp
--- a/mlir/unittests/Transforms/Canonicalizer.cpp
+++ b/mlir/unittests/Transforms/Canonicalizer.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/Parser.h"
+#include "mlir/Parser/Parser.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -1483,11 +1483,9 @@
     includes = ["include"],
     deps = [
         ":ArmSVEIncGen",
-        ":FuncDialect",
         ":IR",
         ":LLVMDialect",
         ":SideEffectInterfaces",
-        ":VectorOps",
         "//llvm:Core",
         "//llvm:Support",
     ],
@@ -1504,7 +1502,6 @@
         ":IR",
         ":LLVMCommonConversion",
         ":LLVMDialect",
-        ":Pass",
         ":TransformUtils",
         "//llvm:Core",
         "//llvm:Support",
@@ -1792,7 +1789,6 @@
     deps = [
         ":Affine",
         ":AffineAnalysis",
-        ":Analysis",
         ":ArithmeticDialect",
         ":BufferizationDialect",
         ":BufferizationTransforms",
@@ -1937,7 +1933,6 @@
         ":ArithmeticDialect",
         ":IR",
         ":LinalgOps",
-        ":SparseTensor",
         "//llvm:Support",
     ],
 )
@@ -1967,7 +1962,6 @@
         ":Pass",
         ":SCFDialect",
         ":SCFTransforms",
-        ":SCFUtils",
         ":SparseTensor",
         ":SparseTensorPassIncGen",
         ":SparseTensorUtils",
@@ -2103,7 +2097,6 @@
         ":LoopLikeInterface",
         ":MemRefDialect",
         ":SideEffectInterfaces",
-        ":Support",
         ":TensorDialect",
         "//llvm:Support",
     ],
@@ -2119,7 +2112,6 @@
     ]),
     includes = ["include"],
     deps = [
-        ":Dialect",
         ":EmitCAttributesIncGen",
         ":EmitCOpsIncGen",
         ":IR",
@@ -2140,12 +2132,9 @@
     deps = [
         ":AsyncOpsIncGen",
         ":ControlFlowInterfaces",
-        ":Dialect",
-        ":FuncDialect",
         ":IR",
         ":InferTypeOpInterface",
         ":SideEffectInterfaces",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -2713,7 +2702,10 @@
         "lib/Dialect/Shape/Transforms/*.cpp",
         "lib/Dialect/Shape/Transforms/*.h",
     ]),
-    hdrs = ["include/mlir/Dialect/Shape/Transforms/Passes.h"],
+    hdrs = [
+        "include/mlir/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.h",
+        "include/mlir/Dialect/Shape/Transforms/Passes.h",
+    ],
     includes = ["include"],
     deps = [
         ":ArithmeticDialect",
@@ -2793,13 +2785,10 @@
     includes = ["include"],
     deps = [
         ":ArithmeticDialect",
-        ":CallOpInterfaces",
-        ":CastOpInterfaces",
         ":CommonFolders",
         ":ControlFlowInterfaces",
         ":ControlFlowOpsIncGen",
         ":IR",
-        ":InferTypeOpInterface",
         ":SideEffectInterfaces",
         ":Support",
         "//llvm:Support",
@@ -2822,7 +2811,6 @@
     includes = ["include"],
     deps = [
         ":ArithmeticDialect",
-        ":ArithmeticUtils",
         ":CallOpInterfaces",
         ":CastOpInterfaces",
         ":CommonFolders",
@@ -2833,7 +2821,6 @@
         ":InferTypeOpInterface",
         ":SideEffectInterfaces",
         ":Support",
-        ":VectorInterfaces",
         "//llvm:Support",
     ],
 )
@@ -2878,19 +2865,14 @@
     hdrs = glob(["include/mlir/Dialect/Func/Transforms/*.h"]),
     includes = ["include"],
     deps = [
-        ":Affine",
-        ":ArithmeticDialect",
-        ":ArithmeticTransforms",
         ":BufferizationDialect",
         ":BufferizationTransforms",
         ":FuncDialect",
         ":FuncTransformsPassIncGen",
         ":IR",
-        ":MemRefDialect",  # TODO: Remove dependency on MemRef dialect
         ":Pass",
         ":SCFDialect",
         ":Support",
-        ":TensorDialect",
         ":Transforms",
         "//llvm:Support",
     ],
@@ -2908,8 +2890,6 @@
     ]),
     includes = ["include"],
     deps = [
-        ":Affine",
-        ":AffineAnalysis",
         ":ArithmeticDialect",
         ":ArithmeticUtils",
         ":DialectUtils",
@@ -2956,19 +2936,15 @@
     includes = ["include"],
     deps = [
         ":Affine",
-        ":AffineAnalysis",
-        ":Analysis",
         ":ArithmeticDialect",
         ":BufferizationDialect",
         ":BufferizationTransforms",
         ":DialectUtils",
-        ":FuncDialect",
         ":IR",
         ":LinalgOps",
         ":MemRefDialect",
         ":Pass",
         ":SCFDialect",
-        ":Support",
         ":TensorDialect",
         ":Transforms",
         ":VectorInterfaces",
@@ -2994,13 +2970,11 @@
         ":Affine",
         ":AffineAnalysis",
         ":ArithmeticDialect",
-        ":DialectUtils",
         ":FuncDialect",
         ":IR",
         ":MemRefDialect",
         ":Support",
         ":TensorDialect",
-        ":VectorInterfaces",
         ":VectorOps",
         "//llvm:Support",
     ],
@@ -3008,20 +2982,11 @@
 
 cc_library(
     name = "Support",
-    srcs = glob(
-        [
-            "lib/Support/*.cpp",
-            "lib/Support/*.h",
-        ],
-        exclude = [
-            # TODO(jpienaar): Move this out, else Support depends on Analysis/
-            "lib/Support/MlirOptMain.cpp",
-        ],
-    ),
-    hdrs = glob(
-        ["include/mlir/Support/*.h"],
-        exclude = ["include/mlir/Support/MlirOptMain.h"],
-    ),
+    srcs = glob([
+        "lib/Support/*.cpp",
+        "lib/Support/*.h",
+    ]),
+    hdrs = glob(["include/mlir/Support/*.h"]),
     includes = ["include"],
     deps = ["//llvm:Support"],
 )
@@ -3064,9 +3029,7 @@
     ]),
     hdrs = glob([
         "include/mlir/Parser/*.h",
-    ]) + [
-        "include/mlir/Parser.h",
-    ],
+    ]),
     includes = ["include"],
     deps = [
         ":IR",
@@ -3794,12 +3757,10 @@
     hdrs = ["include/mlir/Dialect/LLVMIR/NVVMDialect.h"],
     includes = ["include"],
     deps = [
-        ":FuncDialect",
         ":IR",
         ":LLVMDialect",
         ":NVVMOpsIncGen",
         ":SideEffectInterfaces",
-        ":Support",
         "//llvm:AsmParser",
         "//llvm:Core",
         "//llvm:Support",
@@ -3891,12 +3852,10 @@
     hdrs = ["include/mlir/Dialect/LLVMIR/ROCDLDialect.h"],
     includes = ["include"],
     deps = [
-        ":FuncDialect",
         ":IR",
         ":LLVMDialect",
         ":ROCDLOpsIncGen",
         ":SideEffectInterfaces",
-        ":Support",
         "//llvm:AsmParser",
         "//llvm:Core",
         "//llvm:Support",
@@ -3976,7 +3935,6 @@
         ":PDLOpsIncGen",
         ":PDLTypesIncGen",
         ":SideEffects",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -4054,7 +4012,6 @@
         ":PDLDialect",
         ":PDLInterpOpsIncGen",
         ":SideEffects",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -4263,7 +4220,6 @@
         ":IR",
         ":InferTypeOpInterface",
         ":Parser",
-        ":Pass",
         ":SPIRVAttrUtilsGen",
         ":SPIRVAvailabilityIncGen",
         ":SPIRVCanonicalizationIncGen",
@@ -4305,7 +4261,6 @@
     includes = ["include"],
     deps = [
         ":SPIRVDialect",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -4317,7 +4272,6 @@
     includes = ["include"],
     deps = [
         ":SPIRVDialect",
-        ":Support",
         ":TransformUtils",
         "//llvm:Support",
     ],
@@ -4344,7 +4298,6 @@
         ":SPIRVDialect",
         ":SPIRVPassIncGen",
         ":SPIRVUtils",
-        ":Support",
         ":Transforms",
         "//llvm:Support",
     ],
@@ -4355,9 +4308,7 @@
     hdrs = ["lib/Conversion/SPIRVCommon/Pattern.h"],
     includes = ["include"],
     deps = [
-        ":IR",
         ":SPIRVDialect",
-        ":Support",
         ":Transforms",
     ],
 )
@@ -4461,7 +4412,6 @@
     hdrs = ["include/mlir/Target/SPIRV/SPIRVBinaryUtils.h"],
     includes = ["include"],
     deps = [
-        ":IR",
         ":SPIRVAttrUtilsGen",
         ":SPIRVDialect",
         ":SPIRVOpsIncGen",
@@ -4614,7 +4564,6 @@
         ":IR",
         ":InferTypeOpInterface",
         ":SideEffectInterfaces",
-        ":Support",
         ":TensorOpsIncGen",
         ":TilingInterface",
         ":ViewLikeInterface",
@@ -4662,8 +4611,6 @@
     deps = [
         ":Affine",
         ":ArithmeticDialect",
-        ":IR",
-        ":Support",
         ":TensorDialect",
         "//llvm:Support",
     ],
@@ -4702,7 +4649,6 @@
     includes = ["include"],
     deps = [
         ":ArithmeticDialect",
-        ":Async",
         ":BufferizationDialect",
         ":BufferizationTransforms",
         ":DialectUtils",
@@ -4712,7 +4658,6 @@
         ":ParallelLoopMapperAttrGen",
         ":Pass",
         ":SCFDialect",
-        ":Support",
         ":TensorDialect",
         ":TensorPassIncGen",
         ":Transforms",
@@ -4790,7 +4735,6 @@
     deps = [
         ":DerivedAttributeOpInterfaceIncGen",
         ":IR",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -4951,7 +4895,6 @@
     deps = [
         ":Analysis",
         ":ControlFlowInterfaces",
-        ":CopyOpInterface",
         ":IR",
         ":LoopLikeInterface",
         ":Pass",
@@ -5360,7 +5303,6 @@
     deps = [
         ":CallOpInterfacesIncGen",
         ":IR",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -5391,7 +5333,6 @@
     deps = [
         ":CastOpInterfacesIncGen",
         ":IR",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -5422,7 +5363,6 @@
     deps = [
         ":ControlFlowInterfacesIncGen",
         ":IR",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -5484,7 +5424,6 @@
     deps = [
         ":IR",
         ":SideEffectInterfacesIncGen",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -5548,11 +5487,8 @@
 
 cc_library(
     name = "Translation",
-    srcs = glob([
-        "lib/Translation/*.cpp",
-        "lib/Translation/*.h",
-    ]),
-    hdrs = ["include/mlir/Translation.h"],
+    srcs = glob(["lib/Tools/mlir-translate/*.cpp"]),
+    hdrs = glob(["include/mlir/Tools/mlir-translate/*.h"]),
     includes = ["include"],
     deps = [
         ":IR",
@@ -5601,7 +5537,6 @@
         ":AMX",
         ":AMXConversionIncGen",
         ":IR",
-        ":Support",
         ":ToLLVMIRTranslation",
         "//llvm:Core",
         "//llvm:Support",
@@ -5615,7 +5550,6 @@
     includes = ["include"],
     deps = [
         ":IR",
-        ":Support",
         ":ToLLVMIRTranslation",
         ":X86Vector",
         ":X86VectorConversionIncGen",
@@ -5634,7 +5568,6 @@
         ":ArmNeonConversionIncGen",
         ":ArmNeonIncGen",
         ":IR",
-        ":Support",
         ":ToLLVMIRTranslation",
         "//llvm:Core",
         "//llvm:Support",
@@ -5650,7 +5583,6 @@
         ":ArmSVE",
         ":ArmSVEConversionIncGen",
         ":IR",
-        ":Support",
         ":ToLLVMIRTranslation",
         "//llvm:Core",
         "//llvm:Support",
@@ -5666,7 +5598,6 @@
         ":IR",
         ":NVVMConversionIncGen",
         ":NVVMDialect",
-        ":Support",
         ":ToLLVMIRTranslation",
         "//llvm:Core",
         "//llvm:Support",
@@ -5682,7 +5613,6 @@
         ":IR",
         ":ROCDLConversionIncGen",
         ":ROCDLDialect",
-        ":Support",
         ":ToLLVMIRTranslation",
         "//llvm:Core",
         "//llvm:Support",
@@ -5812,9 +5742,7 @@
         ":LLVMDialect",
         ":Support",
         ":ToLLVMIRTranslation",
-        ":Translation",
         "//llvm:AllTargetsAsmParsers",
-        "//llvm:BitReader",
         "//llvm:BitWriter",
         "//llvm:Core",
         "//llvm:ExecutionEngine",
@@ -5846,8 +5774,8 @@
 
 cc_library(
     name = "MlirOptLib",
-    srcs = ["lib/Support/MlirOptMain.cpp"],
-    hdrs = ["include/mlir/Support/MlirOptMain.h"],
+    srcs = ["lib/Tools/mlir-opt/MlirOptMain.cpp"],
+    hdrs = ["include/mlir/Tools/mlir-opt/MlirOptMain.h"],
     includes = ["include"],
     deps = [
         ":IR",
@@ -5875,8 +5803,6 @@
     deps = [
         ":AllPassesAndDialects",
         ":AllTranslations",
-        ":IR",
-        ":Parser",
         ":Support",
         ":Translation",
         "//llvm:Support",
@@ -6067,7 +5993,6 @@
         ":OpenACCToLLVMIRTranslation",
         ":OpenMPToLLVMIRTranslation",
         ":Parser",
-        ":Pass",
         ":SCFToStandard",
         ":Support",
         "//llvm:Core",
@@ -6237,7 +6162,6 @@
         ":GPUDialect",
         ":GPUToSPIRV",
         ":GPUTransforms",
-        ":IR",
         ":LLVMDialect",
         ":LLVMToLLVMIRTranslation",
         ":MemRefDialect",
@@ -6272,7 +6196,6 @@
     srcs = ["tools/mlir-tblgen/mlir-tblgen.cpp"],
     includes = ["include"],
     deps = [
-        ":Support",
         ":TableGen",
         "//llvm:Support",
         "//llvm:TableGen",
@@ -7129,7 +7052,6 @@
         ":FuncDialect",
         ":FuncTransforms",
         ":IR",
-        ":InferTypeOpInterface",
         ":LinalgOps",
         ":LinalgPassIncGen",
         ":LinalgStructuredOpsIncGen",
@@ -7169,11 +7091,9 @@
     deps = [
         ":BufferizationDialect",
         ":BufferizationTransforms",
-        ":DialectUtils",
         ":FuncDialect",
         ":IR",
         ":MemRefDialect",
-        ":Support",
         "//llvm:Support",
     ],
 )
@@ -7620,8 +7540,6 @@
         ":IR",
         ":InferTypeOpInterface",
         ":SideEffectInterfaces",
-        ":Support",
-        ":VectorInterfaces",
         "//llvm:Support",
     ],
 )
@@ -7788,7 +7706,6 @@
         ":IR",
         ":InferTypeOpInterface",
         ":SideEffectInterfaces",
-        ":Support",
         ":VectorInterfaces",
         "//llvm:Support",
     ],
@@ -7831,7 +7748,6 @@
         ":IR",
         ":MemRefDialect",
         ":Pass",
-        ":Support",
         ":TransformUtils",
         ":Transforms",
         "//llvm:Support",
@@ -7928,7 +7844,6 @@
         ":MathBaseIncGen",
         ":MathOpsIncGen",
         ":SideEffectInterfaces",
-        ":Support",
         ":VectorInterfaces",
         "//llvm:Support",
     ],
@@ -7948,9 +7863,6 @@
         ":FuncDialect",
         ":IR",
         ":MathDialect",
-        ":Pass",
-        ":SCFDialect",
-        ":Support",
         ":Transforms",
         ":VectorOps",
         ":VectorUtils",
@@ -8115,7 +8027,6 @@
         ":MemRefDialect",
         ":MemRefPassIncGen",
         ":Pass",
-        ":Support",
         ":TensorDialect",
         ":Transforms",
         ":VectorOps",
@@ -8236,15 +8147,11 @@
         ":BufferizableOpInterfaceIncGen",
         ":BufferizationBaseIncGen",
         ":BufferizationOpsIncGen",
-        ":ControlFlowInterfaces",
-        ":CopyOpInterface",
         ":FuncDialect",
         ":IR",
-        ":InferTypeOpInterface",
         ":MemRefDialect",
         ":Support",
         ":TensorDialect",
-        ":ViewLikeInterface",
         "//llvm:Support",
     ],
 )
@@ -8283,14 +8190,11 @@
         ":BufferizationDialect",
         ":BufferizationPassIncGen",
         ":ControlFlowInterfaces",
-        ":DialectUtils",
         ":FuncDialect",
         ":IR",
-        ":InferTypeOpInterface",
         ":LoopLikeInterface",
         ":MemRefDialect",
         ":Pass",
-        ":Support",
         ":Transforms",
         "//llvm:Support",
     ],
@@ -8428,7 +8332,6 @@
         ":AllPassesAndDialects",
         ":IR",
         ":MlirReduceLib",
-        ":Pass",
         "//llvm:Support",
         "//mlir/test:TestDialect",
     ],