diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -228,9 +228,6 @@ "Whether clang should use a new process for the CC1 invocation") option(CLANG_DEFAULT_PIE_ON_LINUX "Default to -fPIE and -pie on Linux" OFF) -if(CLANG_DEFAULT_PIE_ON_LINUX) - set(CLANG_DEFAULT_PIE_ON_LINUX 1) -endif() # TODO: verify the values against LangStandards.def? set(CLANG_DEFAULT_STD_C "" CACHE STRING diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst --- a/clang/docs/ClangFormat.rst +++ b/clang/docs/ClangFormat.rst @@ -82,6 +82,10 @@ .clang-format file located in one of the parent directories of the source file (or current directory for stdin). + Use -style=file:<format_file_path> to load style + configuration from a format file located at + <format_file_path>. This path can be absolute or + relative to the working directory. Use -style="{key: value, ...}" to set specific parameters, e.g.: -style="{BasedOnStyle: llvm, IndentWidth: 8}" diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -32,6 +32,10 @@ of the input file. When the standard input is used, the search is started from the current directory. +When using ``-style=file:<format_file_path>``, :program:`clang-format` for +each input file will use the format file located at ``<format_file_path>``. +The path may be absolute or relative to the working directory. + The ``.clang-format`` file uses YAML format: .. code-block:: yaml diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -298,6 +298,10 @@ space before parentheses. The custom options can be set using ``SpaceBeforeParensOptions``. +- The command line argument `-style=<string>` has been extended so that a specific + format file at location <format_file_path> can be selected. This is supported + via the syntax: `-style=file:<format_file_path>`. + - Improved C++20 Modules and Coroutines support.
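[editor note, illustrative addition; not part of the patch] The ``-style=file:<format_file_path>`` syntax documented above is also reachable through the library entry point that backs the command-line option, clang::format::getStyle() (see the Format.h hunk later in this diff). A minimal sketch, assuming a made-up config file path /tmp/team.clang-format and file name a.cc; error handling follows the usual llvm::Expected pattern:

  #include "clang/Format/Format.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    llvm::StringRef Code = "int    f( );";
    // "file:<path>" loads exactly this config file instead of searching the
    // parent directories of FileName the way plain "file" does.
    llvm::Expected<clang::format::FormatStyle> Style = clang::format::getStyle(
        /*StyleName=*/"file:/tmp/team.clang-format", /*FileName=*/"a.cc",
        /*FallbackStyle=*/"LLVM", Code);
    if (!Style) {
      llvm::errs() << llvm::toString(Style.takeError()) << "\n";
      return 1;
    }
    // Reformat the whole buffer with the loaded style and apply the edits.
    clang::tooling::Replacements R = clang::format::reformat(
        *Style, Code, {clang::tooling::Range(0, Code.size())});
    llvm::Expected<std::string> Formatted =
        clang::tooling::applyAllReplacements(Code, R);
    if (!Formatted) {
      llvm::errs() << llvm::toString(Formatted.takeError()) << "\n";
      return 1;
    }
    llvm::outs() << *Formatted << "\n";
    return 0;
  }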
libclang diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h --- a/clang/include/clang/AST/AbstractBasicReader.h +++ b/clang/include/clang/AST/AbstractBasicReader.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef CLANG_AST_ABSTRACTBASICREADER_H -#define CLANG_AST_ABSTRACTBASICREADER_H +#ifndef LLVM_CLANG_AST_ABSTRACTBASICREADER_H +#define LLVM_CLANG_AST_ABSTRACTBASICREADER_H #include "clang/AST/DeclTemplate.h" diff --git a/clang/include/clang/AST/AbstractBasicWriter.h b/clang/include/clang/AST/AbstractBasicWriter.h --- a/clang/include/clang/AST/AbstractBasicWriter.h +++ b/clang/include/clang/AST/AbstractBasicWriter.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef CLANG_AST_ABSTRACTBASICWRITER_H -#define CLANG_AST_ABSTRACTBASICWRITER_H +#ifndef LLVM_CLANG_AST_ABSTRACTBASICWRITER_H +#define LLVM_CLANG_AST_ABSTRACTBASICWRITER_H #include "clang/AST/ASTContext.h" #include "clang/AST/DeclTemplate.h" diff --git a/clang/include/clang/AST/AbstractTypeReader.h b/clang/include/clang/AST/AbstractTypeReader.h --- a/clang/include/clang/AST/AbstractTypeReader.h +++ b/clang/include/clang/AST/AbstractTypeReader.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef CLANG_AST_ABSTRACTTYPEREADER_H -#define CLANG_AST_ABSTRACTTYPEREADER_H +#ifndef LLVM_CLANG_AST_ABSTRACTTYPEREADER_H +#define LLVM_CLANG_AST_ABSTRACTTYPEREADER_H #include "clang/AST/Type.h" #include "clang/AST/AbstractBasicReader.h" diff --git a/clang/include/clang/AST/AbstractTypeWriter.h b/clang/include/clang/AST/AbstractTypeWriter.h --- a/clang/include/clang/AST/AbstractTypeWriter.h +++ b/clang/include/clang/AST/AbstractTypeWriter.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef CLANG_AST_ABSTRACTTYPEWRITER_H -#define CLANG_AST_ABSTRACTTYPEWRITER_H +#ifndef LLVM_CLANG_AST_ABSTRACTTYPEWRITER_H +#define LLVM_CLANG_AST_ABSTRACTTYPEWRITER_H #include "clang/AST/Type.h" #include "clang/AST/AbstractBasicWriter.h" diff --git a/clang/include/clang/AST/ComputeDependence.h b/clang/include/clang/AST/ComputeDependence.h --- a/clang/include/clang/AST/ComputeDependence.h +++ b/clang/include/clang/AST/ComputeDependence.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_AST_COMPUTE_DEPENDENCE_H -#define LLVM_CLANG_AST_COMPUTE_DEPENDENCE_H +#ifndef LLVM_CLANG_AST_COMPUTEDEPENDENCE_H +#define LLVM_CLANG_AST_COMPUTEDEPENDENCE_H #include "clang/AST/DependenceFlags.h" #include "clang/Basic/ExceptionSpecificationType.h" diff --git a/clang/include/clang/AST/CurrentSourceLocExprScope.h b/clang/include/clang/AST/CurrentSourceLocExprScope.h --- a/clang/include/clang/AST/CurrentSourceLocExprScope.h +++ b/clang/include/clang/AST/CurrentSourceLocExprScope.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_AST_CURRENT_SOURCE_LOC_EXPR_SCOPE_H -#define LLVM_CLANG_AST_CURRENT_SOURCE_LOC_EXPR_SCOPE_H +#ifndef LLVM_CLANG_AST_CURRENTSOURCELOCEXPRSCOPE_H +#define LLVM_CLANG_AST_CURRENTSOURCELOCEXPRSCOPE_H #include @@ -71,4 +71,4 @@ } // end namespace clang -#endif // LLVM_CLANG_AST_CURRENT_SOURCE_LOC_EXPR_SCOPE_H +#endif // LLVM_CLANG_AST_CURRENTSOURCELOCEXPRSCOPE_H diff --git a/clang/include/clang/AST/DeclObjCCommon.h 
b/clang/include/clang/AST/DeclObjCCommon.h --- a/clang/include/clang/AST/DeclObjCCommon.h +++ b/clang/include/clang/AST/DeclObjCCommon.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_AST_DECLOBJC_COMMON_H -#define LLVM_CLANG_AST_DECLOBJC_COMMON_H +#ifndef LLVM_CLANG_AST_DECLOBJCCOMMON_H +#define LLVM_CLANG_AST_DECLOBJCCOMMON_H namespace clang { @@ -52,4 +52,4 @@ } // namespace clang -#endif // LLVM_CLANG_AST_DECLOBJC_COMMON_H +#endif // LLVM_CLANG_AST_DECLOBJCCOMMON_H diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -15,8 +15,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_ANALYSIS_ANALYSES_FORMATSTRING_H -#define LLVM_CLANG_ANALYSIS_ANALYSES_FORMATSTRING_H +#ifndef LLVM_CLANG_AST_FORMATSTRING_H +#define LLVM_CLANG_AST_FORMATSTRING_H #include "clang/AST/CanonicalType.h" diff --git a/clang/include/clang/AST/LexicallyOrderedRecursiveASTVisitor.h b/clang/include/clang/AST/LexicallyOrderedRecursiveASTVisitor.h --- a/clang/include/clang/AST/LexicallyOrderedRecursiveASTVisitor.h +++ b/clang/include/clang/AST/LexicallyOrderedRecursiveASTVisitor.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_AST_LEXICALLY_ORDERED_RECURSIVEASTVISITOR_H -#define LLVM_CLANG_AST_LEXICALLY_ORDERED_RECURSIVEASTVISITOR_H +#ifndef LLVM_CLANG_AST_LEXICALLYORDEREDRECURSIVEASTVISITOR_H +#define LLVM_CLANG_AST_LEXICALLYORDEREDRECURSIVEASTVISITOR_H #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/LLVM.h" @@ -160,4 +160,4 @@ } // end namespace clang -#endif // LLVM_CLANG_AST_LEXICALLY_ORDERED_RECURSIVEASTVISITOR_H +#endif // LLVM_CLANG_AST_LEXICALLYORDEREDRECURSIVEASTVISITOR_H diff --git a/clang/include/clang/AST/LocInfoType.h b/clang/include/clang/AST/LocInfoType.h --- a/clang/include/clang/AST/LocInfoType.h +++ b/clang/include/clang/AST/LocInfoType.h @@ -10,8 +10,8 @@ // source-location information. 
// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_SEMA_LOCINFOTYPE_H -#define LLVM_CLANG_SEMA_LOCINFOTYPE_H +#ifndef LLVM_CLANG_AST_LOCINFOTYPE_H +#define LLVM_CLANG_AST_LOCINFOTYPE_H #include "clang/AST/Type.h" @@ -54,4 +54,4 @@ } // end namespace clang -#endif // LLVM_CLANG_SEMA_LOCINFOTYPE_H +#endif // LLVM_CLANG_AST_LOCINFOTYPE_H diff --git a/clang/include/clang/AST/NonTrivialTypeVisitor.h b/clang/include/clang/AST/NonTrivialTypeVisitor.h --- a/clang/include/clang/AST/NonTrivialTypeVisitor.h +++ b/clang/include/clang/AST/NonTrivialTypeVisitor.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_NON_TRIVIAL_TYPE_VISITOR_H -#define LLVM_CLANG_NON_TRIVIAL_TYPE_VISITOR_H +#ifndef LLVM_CLANG_AST_NONTRIVIALTYPEVISITOR_H +#define LLVM_CLANG_AST_NONTRIVIALTYPEVISITOR_H #include "clang/AST/Type.h" diff --git a/clang/include/clang/AST/OSLog.h b/clang/include/clang/AST/OSLog.h --- a/clang/include/clang/AST/OSLog.h +++ b/clang/include/clang/AST/OSLog.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_ANALYSIS_ANALYSES_OSLOG_H -#define LLVM_CLANG_ANALYSIS_ANALYSES_OSLOG_H +#ifndef LLVM_CLANG_AST_OSLOG_H +#define LLVM_CLANG_AST_OSLOG_H #include "clang/AST/ASTContext.h" #include "clang/AST/Expr.h" diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -32,6 +32,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Frontend/OpenMP/OMPAssume.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPContext.h" #include "llvm/Support/Casting.h" diff --git a/clang/include/clang/AST/QualTypeNames.h b/clang/include/clang/AST/QualTypeNames.h --- a/clang/include/clang/AST/QualTypeNames.h +++ b/clang/include/clang/AST/QualTypeNames.h @@ -89,4 +89,4 @@ bool WithGlobalNsPrefix = false); } // end namespace TypeName } // end namespace clang -#endif // LLVM_CLANG_TOOLING_CORE_QUALTYPENAMES_H +#endif // LLVM_CLANG_AST_QUALTYPENAMES_H diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -3725,10 +3725,9 @@ /// \endcode AST_MATCHER_P(ObjCMessageExpr, hasSelector, std::string, BaseName) { Selector Sel = Node.getSelector(); - return BaseName.compare(Sel.getAsString()) == 0; + return BaseName == Sel.getAsString(); } - /// Matches when at least one of the supplied string equals to the /// Selector.getAsString() /// diff --git a/clang/include/clang/ASTMatchers/Dynamic/Diagnostics.h b/clang/include/clang/ASTMatchers/Dynamic/Diagnostics.h --- a/clang/include/clang/ASTMatchers/Dynamic/Diagnostics.h +++ b/clang/include/clang/ASTMatchers/Dynamic/Diagnostics.h @@ -186,4 +186,4 @@ } // namespace ast_matchers } // namespace clang -#endif // LLVM_CLANG_AST_MATCHERS_DYNAMIC_DIAGNOSTICS_H +#endif // LLVM_CLANG_ASTMATCHERS_DYNAMIC_DIAGNOSTICS_H diff --git a/clang/include/clang/ASTMatchers/Dynamic/Parser.h b/clang/include/clang/ASTMatchers/Dynamic/Parser.h --- a/clang/include/clang/ASTMatchers/Dynamic/Parser.h +++ b/clang/include/clang/ASTMatchers/Dynamic/Parser.h @@ -280,4 +280,4 @@ } // namespace ast_matchers } // namespace clang -#endif // 
LLVM_CLANG_AST_MATCHERS_DYNAMIC_PARSER_H +#endif // LLVM_CLANG_ASTMATCHERS_DYNAMIC_PARSER_H diff --git a/clang/include/clang/ASTMatchers/Dynamic/Registry.h b/clang/include/clang/ASTMatchers/Dynamic/Registry.h --- a/clang/include/clang/ASTMatchers/Dynamic/Registry.h +++ b/clang/include/clang/ASTMatchers/Dynamic/Registry.h @@ -157,4 +157,4 @@ } // namespace ast_matchers } // namespace clang -#endif // LLVM_CLANG_AST_MATCHERS_DYNAMIC_REGISTRY_H +#endif // LLVM_CLANG_ASTMATCHERS_DYNAMIC_REGISTRY_H diff --git a/clang/include/clang/ASTMatchers/Dynamic/VariantValue.h b/clang/include/clang/ASTMatchers/Dynamic/VariantValue.h --- a/clang/include/clang/ASTMatchers/Dynamic/VariantValue.h +++ b/clang/include/clang/ASTMatchers/Dynamic/VariantValue.h @@ -356,4 +356,4 @@ } // end namespace ast_matchers } // end namespace clang -#endif // LLVM_CLANG_AST_MATCHERS_DYNAMIC_VARIANT_VALUE_H +#endif // LLVM_CLANG_ASTMATCHERS_DYNAMIC_VARIANTVALUE_H diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h --- a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h +++ b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h @@ -517,4 +517,4 @@ } // namespace threadSafety } // namespace clang -#endif // LLVM_CLANG_THREAD_SAFETY_COMMON_H +#endif // LLVM_CLANG_ANALYSIS_ANALYSES_THREADSAFETYCOMMON_H diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafetyUtil.h b/clang/include/clang/Analysis/Analyses/ThreadSafetyUtil.h --- a/clang/include/clang/Analysis/Analyses/ThreadSafetyUtil.h +++ b/clang/include/clang/Analysis/Analyses/ThreadSafetyUtil.h @@ -354,4 +354,4 @@ } // namespace threadSafety } // namespace clang -#endif // LLVM_CLANG_THREAD_SAFETY_UTIL_H +#endif // LLVM_CLANG_ANALYSIS_ANALYSES_THREADSAFETYUTIL_H diff --git a/clang/include/clang/Analysis/AnyCall.h b/clang/include/clang/Analysis/AnyCall.h --- a/clang/include/clang/Analysis/AnyCall.h +++ b/clang/include/clang/Analysis/AnyCall.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// // -#ifndef LLVM_CLANG_ANALYSIS_ANY_CALL_H -#define LLVM_CLANG_ANALYSIS_ANY_CALL_H +#ifndef LLVM_CLANG_ANALYSIS_ANYCALL_H +#define LLVM_CLANG_ANALYSIS_ANYCALL_H #include "clang/AST/Decl.h" #include "clang/AST/ExprCXX.h" @@ -215,4 +215,4 @@ } -#endif // LLVM_CLANG_ANALYSIS_ANY_CALL_H +#endif // LLVM_CLANG_ANALYSIS_ANYCALL_H diff --git a/clang/include/clang/Analysis/BodyFarm.h b/clang/include/clang/Analysis/BodyFarm.h --- a/clang/include/clang/Analysis/BodyFarm.h +++ b/clang/include/clang/Analysis/BodyFarm.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_LIB_ANALYSIS_BODYFARM_H -#define LLVM_CLANG_LIB_ANALYSIS_BODYFARM_H +#ifndef LLVM_CLANG_ANALYSIS_BODYFARM_H +#define LLVM_CLANG_ANALYSIS_BODYFARM_H #include "clang/AST/DeclBase.h" #include "clang/Basic/LLVM.h" diff --git a/clang/include/clang/Analysis/CloneDetection.h b/clang/include/clang/Analysis/CloneDetection.h --- a/clang/include/clang/Analysis/CloneDetection.h +++ b/clang/include/clang/Analysis/CloneDetection.h @@ -11,8 +11,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_AST_CLONEDETECTION_H -#define LLVM_CLANG_AST_CLONEDETECTION_H +#ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H +#define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H #include "clang/AST/StmtVisitor.h" #include "llvm/Support/Regex.h" @@ -441,4 +441,4 @@ } // end namespace clang -#endif // 
LLVM_CLANG_AST_CLONEDETECTION_H +#endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H diff --git a/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h @@ -0,0 +1,57 @@ +//===-- ControlFlowContext.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a ControlFlowContext class that is used by dataflow +// analyses that run over Control-Flow Graphs (CFGs). +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CONTROLFLOWCONTEXT_H +#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CONTROLFLOWCONTEXT_H + +#include "clang/AST/ASTContext.h" +#include "clang/AST/Decl.h" +#include "clang/AST/Stmt.h" +#include "clang/Analysis/CFG.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Error.h" +#include <memory> +#include <utility> + +namespace clang { +namespace dataflow { + +/// Holds CFG and other derived context that is needed to perform dataflow +/// analysis. +class ControlFlowContext { +public: + /// Builds a ControlFlowContext from an AST node. + static llvm::Expected<ControlFlowContext> build(const Decl *D, Stmt *S, + ASTContext *C); + + /// Returns the CFG that is stored in this context. + const CFG &getCFG() const { return *Cfg; } + + /// Returns a mapping from statements to basic blocks that contain them. + const llvm::DenseMap<const Stmt *, const CFGBlock *> &getStmtToBlock() const { + return StmtToBlock; + } + +private: + ControlFlowContext(std::unique_ptr<CFG> Cfg, + llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock) + : Cfg(std::move(Cfg)), StmtToBlock(std::move(StmtToBlock)) {} + + std::unique_ptr<CFG> Cfg; + llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock; +}; + +} // namespace dataflow +} // namespace clang + +#endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CONTROLFLOWCONTEXT_H diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h --- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h @@ -21,6 +21,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Stmt.h" #include "clang/Analysis/CFG.h" +#include "clang/Analysis/FlowSensitive/ControlFlowContext.h" #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" #include "clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h" #include "llvm/ADT/Any.h" @@ -101,17 +102,12 @@ /// Performs dataflow analysis and returns a mapping from basic block IDs to /// dataflow analysis states that model the respective basic blocks. Indices /// of the returned vector correspond to basic block IDs. -/// -/// Requirements: -/// -/// `Cfg` must have been built with `CFG::BuildOptions::setAllAlwaysAdd()` to -/// ensure that all sub-expressions in a basic block are evaluated.
template <typename AnalysisT> std::vector<llvm::Optional<DataflowAnalysisState<typename AnalysisT::Lattice>>> -runDataflowAnalysis(const CFG &Cfg, AnalysisT &Analysis, +runDataflowAnalysis(const ControlFlowContext &CFCtx, AnalysisT &Analysis, const Environment &InitEnv) { auto TypeErasedBlockStates = - runTypeErasedDataflowAnalysis(Cfg, Analysis, InitEnv); + runTypeErasedDataflowAnalysis(CFCtx, Analysis, InitEnv); std::vector< llvm::Optional<DataflowAnalysisState<typename AnalysisT::Lattice>>> BlockStates; diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowWorklist.h b/clang/include/clang/Analysis/FlowSensitive/DataflowWorklist.h --- a/clang/include/clang/Analysis/FlowSensitive/DataflowWorklist.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowWorklist.h @@ -92,4 +92,4 @@ } // namespace clang -#endif // LLVM_CLANG_ANALYSIS_ANALYSES_CONSUMED_H +#endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_DATAFLOWWORKLIST_H diff --git a/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h --- a/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h +++ b/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h @@ -19,6 +19,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Stmt.h" #include "clang/Analysis/CFG.h" +#include "clang/Analysis/FlowSensitive/ControlFlowContext.h" #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" #include "clang/Analysis/FlowSensitive/DataflowLattice.h" #include "llvm/ADT/Any.h" @@ -87,6 +88,7 @@ /// already been transferred. States in `BlockStates` that are set to /// `llvm::None` represent basic blocks that are not evaluated yet. TypeErasedDataflowAnalysisState transferBlock( + const ControlFlowContext &CFCtx, std::vector<llvm::Optional<TypeErasedDataflowAnalysisState>> &BlockStates, const CFGBlock &Block, const Environment &InitEnv, TypeErasedDataflowAnalysis &Analysis, @@ -97,13 +99,8 @@ /// Performs dataflow analysis and returns a mapping from basic block IDs to /// dataflow analysis states that model the respective basic blocks. Indices /// of the returned vector correspond to basic block IDs. -/// -/// Requirements: -/// -/// `Cfg` must have been built with `CFG::BuildOptions::setAllAlwaysAdd()` to -/// ensure that all sub-expressions in a basic block are evaluated.
std::vector<llvm::Optional<TypeErasedDataflowAnalysisState>> -runTypeErasedDataflowAnalysis(const CFG &Cfg, +runTypeErasedDataflowAnalysis(const ControlFlowContext &CFCtx, TypeErasedDataflowAnalysis &Analysis, const Environment &InitEnv); diff --git a/clang/include/clang/Analysis/IssueHash.h b/clang/include/clang/Analysis/IssueHash.h --- a/clang/include/clang/Analysis/IssueHash.h +++ b/clang/include/clang/Analysis/IssueHash.h @@ -5,8 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_STATICANALYZER_CORE_ISSUE_HASH_H -#define LLVM_CLANG_STATICANALYZER_CORE_ISSUE_HASH_H +#ifndef LLVM_CLANG_ANALYSIS_ISSUEHASH_H +#define LLVM_CLANG_ANALYSIS_ISSUEHASH_H #include "llvm/ADT/SmallString.h" diff --git a/clang/include/clang/Analysis/PathDiagnostic.h b/clang/include/clang/Analysis/PathDiagnostic.h --- a/clang/include/clang/Analysis/PathDiagnostic.h +++ b/clang/include/clang/Analysis/PathDiagnostic.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_STATICANALYZER_CORE_BUGREPORTER_PATHDIAGNOSTIC_H -#define LLVM_CLANG_STATICANALYZER_CORE_BUGREPORTER_PATHDIAGNOSTIC_H +#ifndef LLVM_CLANG_ANALYSIS_PATHDIAGNOSTIC_H +#define LLVM_CLANG_ANALYSIS_PATHDIAGNOSTIC_H #include "clang/AST/Stmt.h" #include "clang/Analysis/AnalysisDeclContext.h" @@ -905,4 +905,4 @@ } // namespace ento } // namespace clang -#endif // LLVM_CLANG_STATICANALYZER_CORE_BUGREPORTER_PATHDIAGNOSTIC_H +#endif // LLVM_CLANG_ANALYSIS_PATHDIAGNOSTIC_H diff --git a/clang/include/clang/Analysis/RetainSummaryManager.h b/clang/include/clang/Analysis/RetainSummaryManager.h --- a/clang/include/clang/Analysis/RetainSummaryManager.h +++ b/clang/include/clang/Analysis/RetainSummaryManager.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_ANALYSIS_RETAINSUMMARY_MANAGER_H -#define LLVM_CLANG_ANALYSIS_RETAINSUMMARY_MANAGER_H +#ifndef LLVM_CLANG_ANALYSIS_RETAINSUMMARYMANAGER_H +#define LLVM_CLANG_ANALYSIS_RETAINSUMMARYMANAGER_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/FoldingSet.h" diff --git a/clang/include/clang/Analysis/SelectorExtras.h b/clang/include/clang/Analysis/SelectorExtras.h --- a/clang/include/clang/Analysis/SelectorExtras.h +++ b/clang/include/clang/Analysis/SelectorExtras.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_LIB_ANALYSIS_SELECTOREXTRAS_H -#define LLVM_CLANG_LIB_ANALYSIS_SELECTOREXTRAS_H +#ifndef LLVM_CLANG_ANALYSIS_SELECTOREXTRAS_H +#define LLVM_CLANG_ANALYSIS_SELECTOREXTRAS_H #include "clang/AST/ASTContext.h" diff --git a/clang/include/clang/Basic/AlignedAllocation.h b/clang/include/clang/Basic/AlignedAllocation.h --- a/clang/include/clang/Basic/AlignedAllocation.h +++ b/clang/include/clang/Basic/AlignedAllocation.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_BASIC_ALIGNED_ALLOCATION_H -#define LLVM_CLANG_BASIC_ALIGNED_ALLOCATION_H +#ifndef LLVM_CLANG_BASIC_ALIGNEDALLOCATION_H +#define LLVM_CLANG_BASIC_ALIGNEDALLOCATION_H #include "llvm/ADT/Triple.h" #include "llvm/Support/ErrorHandling.h" @@ -42,4 +42,4 @@ } // end namespace clang -#endif // LLVM_CLANG_BASIC_ALIGNED_ALLOCATION_H +#endif // LLVM_CLANG_BASIC_ALIGNEDALLOCATION_H diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td --- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td @@ -5987,7 +5987,7 @@ def DiagnoseAsBuiltinDocs : Documentation { let Category = DocCatFunction; let Content = [{ -The ``diagnose_as_builtin` attribute indicates that Fortify diagnostics are to +The ``diagnose_as_builtin`` attribute indicates that Fortify diagnostics are to be applied to the declared function as if it were the function specified by the attribute. The builtin function whose diagnostics are to be mimicked should be given. In addition, the order in which arguments should be applied must also @@ -5995,12 +5995,12 @@ For example, the attribute can be used as follows. - .. code-block:: c +.. code-block:: c - __attribute__((diagnose_as_builtin(__builtin_memset, 3, 2, 1))) - void *mymemset(int n, int c, void *s) { - // ... - } + __attribute__((diagnose_as_builtin(__builtin_memset, 3, 2, 1))) + void *mymemset(int n, int c, void *s) { + // ... + } This indicates that calls to ``mymemset`` should be diagnosed as if they were calls to ``__builtin_memset``. The arguments ``3, 2, 1`` indicate by index the @@ -6015,7 +6015,8 @@ to diagnose a new function as if it were `sscanf`, we can use the attribute as follows. - .. code-block:: c +.. code-block:: c + __attribute__((diagnose_as_builtin(sscanf, 1, 2))) int mysscanf(const char *str, const char *format, ...) { // ... diff --git a/clang/include/clang/Basic/AttrSubjectMatchRules.h b/clang/include/clang/Basic/AttrSubjectMatchRules.h --- a/clang/include/clang/Basic/AttrSubjectMatchRules.h +++ b/clang/include/clang/Basic/AttrSubjectMatchRules.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_BASIC_ATTR_SUBJECT_MATCH_RULES_H -#define LLVM_CLANG_BASIC_ATTR_SUBJECT_MATCH_RULES_H +#ifndef LLVM_CLANG_BASIC_ATTRSUBJECTMATCHRULES_H +#define LLVM_CLANG_BASIC_ATTRSUBJECTMATCHRULES_H #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/DenseMap.h" diff --git a/clang/include/clang/Basic/DarwinSDKInfo.h b/clang/include/clang/Basic/DarwinSDKInfo.h --- a/clang/include/clang/Basic/DarwinSDKInfo.h +++ b/clang/include/clang/Basic/DarwinSDKInfo.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_BASIC_DARWIN_SDK_INFO_H -#define LLVM_CLANG_BASIC_DARWIN_SDK_INFO_H +#ifndef LLVM_CLANG_BASIC_DARWINSDKINFO_H +#define LLVM_CLANG_BASIC_DARWINSDKINFO_H #include "clang/Basic/LLVM.h" #include "llvm/ADT/DenseMap.h" @@ -154,4 +154,4 @@ } // end namespace clang -#endif // LLVM_CLANG_BASIC_DARWIN_SDK_INFO_H +#endif // LLVM_CLANG_BASIC_DARWINSDKINFO_H diff --git a/clang/include/clang/Basic/DiagnosticError.h b/clang/include/clang/Basic/DiagnosticError.h --- a/clang/include/clang/Basic/DiagnosticError.h +++ b/clang/include/clang/Basic/DiagnosticError.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_BASIC_DIAGNOSTIC_ERROR_H -#define LLVM_CLANG_BASIC_DIAGNOSTIC_ERROR_H +#ifndef LLVM_CLANG_BASIC_DIAGNOSTICERROR_H +#define LLVM_CLANG_BASIC_DIAGNOSTICERROR_H #include "clang/Basic/PartialDiagnostic.h" #include "llvm/Support/Error.h" @@ -57,4 +57,4 @@ } // end namespace clang -#endif // LLVM_CLANG_BASIC_DIAGNOSTIC_ERROR_H +#endif // LLVM_CLANG_BASIC_DIAGNOSTICERROR_H diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -5785,7 +5785,7 @@ def 
ext_typecheck_zero_array_size : Extension< "zero size arrays are an extension">, InGroup; def err_typecheck_zero_array_size : Error< - "zero-length arrays are not permitted in C++">; + "zero-length arrays are not permitted in %select{C++|SYCL device code}0">; def err_array_size_non_int : Error<"size of array has non-integer type %0">; def err_init_element_not_constant : Error< "initializer element is not a compile-time constant">; diff --git a/clang/include/clang/Basic/OperatorPrecedence.h b/clang/include/clang/Basic/OperatorPrecedence.h --- a/clang/include/clang/Basic/OperatorPrecedence.h +++ b/clang/include/clang/Basic/OperatorPrecedence.h @@ -49,4 +49,4 @@ } // end namespace clang -#endif // LLVM_CLANG_OPERATOR_PRECEDENCE_H +#endif // LLVM_CLANG_BASIC_OPERATORPRECEDENCE_H diff --git a/clang/include/clang/Basic/PragmaKinds.h b/clang/include/clang/Basic/PragmaKinds.h --- a/clang/include/clang/Basic/PragmaKinds.h +++ b/clang/include/clang/Basic/PragmaKinds.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_BASIC_PRAGMA_KINDS_H -#define LLVM_CLANG_BASIC_PRAGMA_KINDS_H +#ifndef LLVM_CLANG_BASIC_PRAGMAKINDS_H +#define LLVM_CLANG_BASIC_PRAGMAKINDS_H namespace clang { diff --git a/clang/include/clang/Basic/ProfileList.h b/clang/include/clang/Basic/ProfileList.h --- a/clang/include/clang/Basic/ProfileList.h +++ b/clang/include/clang/Basic/ProfileList.h @@ -10,8 +10,8 @@ // functions. // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_BASIC_INSTRPROFLIST_H -#define LLVM_CLANG_BASIC_INSTRPROFLIST_H +#ifndef LLVM_CLANG_BASIC_PROFILELIST_H +#define LLVM_CLANG_BASIC_PROFILELIST_H #include "clang/Basic/CodeGenOptions.h" #include "clang/Basic/LLVM.h" diff --git a/clang/include/clang/Basic/TargetID.h b/clang/include/clang/Basic/TargetID.h --- a/clang/include/clang/Basic/TargetID.h +++ b/clang/include/clang/Basic/TargetID.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_BASIC_TARGET_ID_H -#define LLVM_CLANG_BASIC_TARGET_ID_H +#ifndef LLVM_CLANG_BASIC_TARGETID_H +#define LLVM_CLANG_BASIC_TARGETID_H #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" @@ -21,7 +21,7 @@ /// postfixed by a plus or minus sign delimited by colons, e.g. /// gfx908:xnack+:sramecc-. Each processor have a limited /// number of predefined features when showing up in a target ID. 
-const llvm::SmallVector<llvm::StringRef, 4> +llvm::SmallVector<llvm::StringRef, 4> getAllPossibleTargetIDFeatures(const llvm::Triple &T, llvm::StringRef Processor); diff --git a/clang/include/clang/CodeGen/ObjectFilePCHContainerOperations.h b/clang/include/clang/CodeGen/ObjectFilePCHContainerOperations.h --- a/clang/include/clang/CodeGen/ObjectFilePCHContainerOperations.h +++ b/clang/include/clang/CodeGen/ObjectFilePCHContainerOperations.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_CODEGEN_OBJECT_FILE_PCH_CONTAINER_OPERATIONS_H -#define LLVM_CLANG_CODEGEN_OBJECT_FILE_PCH_CONTAINER_OPERATIONS_H +#ifndef LLVM_CLANG_CODEGEN_OBJECTFILEPCHCONTAINEROPERATIONS_H +#define LLVM_CLANG_CODEGEN_OBJECTFILEPCHCONTAINEROPERATIONS_H #include "clang/Frontend/PCHContainerOperations.h" diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4066,6 +4066,8 @@ /// * "file" - Load style configuration from a file called ``.clang-format`` /// located in one of the parent directories of ``FileName`` or the current /// directory if ``FileName`` is empty. +/// * "file:<format_file_path>" to explicitly specify the configuration file to +/// use. /// /// \param[in] StyleName Style name to interpret according to the description /// above. diff --git a/clang/include/clang/Frontend/PCHContainerOperations.h b/clang/include/clang/Frontend/PCHContainerOperations.h --- a/clang/include/clang/Frontend/PCHContainerOperations.h +++ b/clang/include/clang/Frontend/PCHContainerOperations.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_PCH_CONTAINER_OPERATIONS_H -#define LLVM_CLANG_PCH_CONTAINER_OPERATIONS_H +#ifndef LLVM_CLANG_FRONTEND_PCHCONTAINEROPERATIONS_H +#define LLVM_CLANG_FRONTEND_PCHCONTAINEROPERATIONS_H #include "clang/Serialization/PCHContainerOperations.h" diff --git a/clang/include/clang/Frontend/PrecompiledPreamble.h b/clang/include/clang/Frontend/PrecompiledPreamble.h --- a/clang/include/clang/Frontend/PrecompiledPreamble.h +++ b/clang/include/clang/Frontend/PrecompiledPreamble.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_FRONTEND_PRECOMPILED_PREAMBLE_H -#define LLVM_CLANG_FRONTEND_PRECOMPILED_PREAMBLE_H +#ifndef LLVM_CLANG_FRONTEND_PRECOMPILEDPREAMBLE_H +#define LLVM_CLANG_FRONTEND_PRECOMPILEDPREAMBLE_H #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" diff --git a/clang/include/clang/Frontend/SerializedDiagnostics.h b/clang/include/clang/Frontend/SerializedDiagnostics.h --- a/clang/include/clang/Frontend/SerializedDiagnostics.h +++ b/clang/include/clang/Frontend/SerializedDiagnostics.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_FRONTEND_SERIALIZE_DIAGNOSTICS_H_ -#define LLVM_CLANG_FRONTEND_SERIALIZE_DIAGNOSTICS_H_ +#ifndef LLVM_CLANG_FRONTEND_SERIALIZEDDIAGNOSTICS_H +#define LLVM_CLANG_FRONTEND_SERIALIZEDDIAGNOSTICS_H #include "llvm/Bitstream/BitCodes.h" diff --git a/clang/include/clang/IndexSerialization/SerializablePathCollection.h b/clang/include/clang/IndexSerialization/SerializablePathCollection.h --- a/clang/include/clang/IndexSerialization/SerializablePathCollection.h +++ b/clang/include/clang/IndexSerialization/SerializablePathCollection.h @@ -6,8 +6,8 @@ //
//===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_INDEX_SerializablePathCollection_H -#define LLVM_CLANG_INDEX_SerializablePathCollection_H +#ifndef LLVM_CLANG_INDEXSERIALIZATION_SERIALIZABLEPATHCOLLECTION_H +#define LLVM_CLANG_INDEXSERIALIZATION_SERIALIZABLEPATHCOLLECTION_H #include "clang/Basic/FileManager.h" #include "llvm/ADT/APInt.h" @@ -126,4 +126,4 @@ } // namespace index } // namespace clang -#endif // LLVM_CLANG_INDEX_SerializablePathCollection_H +#endif // LLVM_CLANG_INDEXSERIALIZATION_SERIALIZABLEPATHCOLLECTION_H diff --git a/clang/include/clang/Lex/DependencyDirectivesSourceMinimizer.h b/clang/include/clang/Lex/DependencyDirectivesSourceMinimizer.h --- a/clang/include/clang/Lex/DependencyDirectivesSourceMinimizer.h +++ b/clang/include/clang/Lex/DependencyDirectivesSourceMinimizer.h @@ -14,8 +14,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_LEX_DEPENDENCY_DIRECTIVES_SOURCE_MINIMIZER_H -#define LLVM_CLANG_LEX_DEPENDENCY_DIRECTIVES_SOURCE_MINIMIZER_H +#ifndef LLVM_CLANG_LEX_DEPENDENCYDIRECTIVESSOURCEMINIMIZER_H +#define LLVM_CLANG_LEX_DEPENDENCYDIRECTIVESSOURCEMINIMIZER_H #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/ArrayRef.h" @@ -112,4 +112,4 @@ } // end namespace clang -#endif // LLVM_CLANG_LEX_DEPENDENCY_DIRECTIVES_SOURCE_MINIMIZER_H +#endif // LLVM_CLANG_LEX_DEPENDENCYDIRECTIVESSOURCEMINIMIZER_H diff --git a/clang/include/clang/Lex/PreprocessorExcludedConditionalDirectiveSkipMapping.h b/clang/include/clang/Lex/PreprocessorExcludedConditionalDirectiveSkipMapping.h --- a/clang/include/clang/Lex/PreprocessorExcludedConditionalDirectiveSkipMapping.h +++ b/clang/include/clang/Lex/PreprocessorExcludedConditionalDirectiveSkipMapping.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_LEX_PREPROCESSOR_EXCLUDED_COND_DIRECTIVE_SKIP_MAPPING_H -#define LLVM_CLANG_LEX_PREPROCESSOR_EXCLUDED_COND_DIRECTIVE_SKIP_MAPPING_H +#ifndef LLVM_CLANG_LEX_PREPROCESSOREXCLUDEDCONDITIONALDIRECTIVESKIPMAPPING_H +#define LLVM_CLANG_LEX_PREPROCESSOREXCLUDEDCONDITIONALDIRECTIVESKIPMAPPING_H #include "clang/Basic/LLVM.h" #include "llvm/ADT/DenseMap.h" @@ -27,4 +27,4 @@ } // end namespace clang -#endif // LLVM_CLANG_LEX_PREPROCESSOR_EXCLUDED_COND_DIRECTIVE_SKIP_MAPPING_H +#endif // LLVM_CLANG_LEX_PREPROCESSOREXCLUDEDCONDITIONALDIRECTIVESKIPMAPPING_H diff --git a/clang/include/clang/Parse/RAIIObjectsForParser.h b/clang/include/clang/Parse/RAIIObjectsForParser.h --- a/clang/include/clang/Parse/RAIIObjectsForParser.h +++ b/clang/include/clang/Parse/RAIIObjectsForParser.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_LIB_PARSE_RAIIOBJECTSFORPARSER_H -#define LLVM_CLANG_LIB_PARSE_RAIIOBJECTSFORPARSER_H +#ifndef LLVM_CLANG_PARSE_RAIIOBJECTSFORPARSER_H +#define LLVM_CLANG_PARSE_RAIIOBJECTSFORPARSER_H #include "clang/Parse/ParseDiagnostic.h" #include "clang/Parse/Parser.h" diff --git a/clang/include/clang/Sema/CleanupInfo.h b/clang/include/clang/Sema/CleanupInfo.h --- a/clang/include/clang/Sema/CleanupInfo.h +++ b/clang/include/clang/Sema/CleanupInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_SEMA_CLEANUP_INFO_H -#define LLVM_CLANG_SEMA_CLEANUP_INFO_H +#ifndef LLVM_CLANG_SEMA_CLEANUPINFO_H +#define LLVM_CLANG_SEMA_CLEANUPINFO_H namespace clang { diff --git 
a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_SEMA_ATTRIBUTELIST_H -#define LLVM_CLANG_SEMA_ATTRIBUTELIST_H +#ifndef LLVM_CLANG_SEMA_PARSEDATTR_H +#define LLVM_CLANG_SEMA_PARSEDATTR_H #include "clang/Basic/AttrSubjectMatchRules.h" #include "clang/Basic/AttributeCommonInfo.h" @@ -1159,4 +1159,4 @@ } // namespace clang -#endif // LLVM_CLANG_SEMA_ATTRIBUTELIST_H +#endif // LLVM_CLANG_SEMA_PARSEDATTR_H diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13142,6 +13142,9 @@ /// Adds Callee to DeviceCallGraph if we don't know if its caller will be /// codegen'ed yet. bool checkSYCLDeviceFunction(SourceLocation Loc, FunctionDecl *Callee); + void deepTypeCheckForSYCLDevice(SourceLocation UsedAt, + llvm::DenseSet<QualType> Visited, + ValueDecl *DeclToCheck); }; /// RAII object that enters a new expression evaluation context. diff --git a/clang/include/clang/Sema/SemaConcept.h b/clang/include/clang/Sema/SemaConcept.h --- a/clang/include/clang/Sema/SemaConcept.h +++ b/clang/include/clang/Sema/SemaConcept.h @@ -152,4 +152,4 @@ } // clang -#endif //LLVM_CLANG_SEMA_SEMACONCEPT_H +#endif // LLVM_CLANG_SEMA_SEMACONCEPT_H diff --git a/clang/include/clang/Sema/TemplateInstCallback.h b/clang/include/clang/Sema/TemplateInstCallback.h --- a/clang/include/clang/Sema/TemplateInstCallback.h +++ b/clang/include/clang/Sema/TemplateInstCallback.h @@ -11,8 +11,8 @@ // //===---------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TEMPLATE_INST_CALLBACK_H -#define LLVM_CLANG_TEMPLATE_INST_CALLBACK_H +#ifndef LLVM_CLANG_SEMA_TEMPLATEINSTCALLBACK_H +#define LLVM_CLANG_SEMA_TEMPLATEINSTCALLBACK_H #include "clang/Sema/Sema.h" diff --git a/clang/include/clang/Serialization/ModuleFileExtension.h b/clang/include/clang/Serialization/ModuleFileExtension.h --- a/clang/include/clang/Serialization/ModuleFileExtension.h +++ b/clang/include/clang/Serialization/ModuleFileExtension.h @@ -154,4 +154,4 @@ } // end namespace clang -#endif // LLVM_CLANG_FRONTEND_MODULEFILEEXTENSION_H +#endif // LLVM_CLANG_SERIALIZATION_MODULEFILEEXTENSION_H diff --git a/clang/include/clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h b/clang/include/clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h --- a/clang/include/clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h +++ b/clang/include/clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_CLANGSACHECKERS_H -#define LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_CLANGSACHECKERS_H +#ifndef LLVM_CLANG_STATICANALYZER_CHECKERS_BUILTINCHECKERREGISTRATION_H +#define LLVM_CLANG_STATICANALYZER_CHECKERS_BUILTINCHECKERREGISTRATION_H #include "clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h" diff --git a/clang/include/clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h b/clang/include/clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h --- a/clang/include/clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h +++ b/clang/include/clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h @@ -11,8 +11,8 @@ ///
//===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPIFUNCTIONCLASSIFIER_H -#define LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_MPICHECKER_MPIFUNCTIONCLASSIFIER_H +#ifndef LLVM_CLANG_STATICANALYZER_CHECKERS_MPIFUNCTIONCLASSIFIER_H +#define LLVM_CLANG_STATICANALYZER_CHECKERS_MPIFUNCTIONCLASSIFIER_H #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_LIB_STATICANALYZER_CORE_RANGEDCONSTRAINTMANAGER_H -#define LLVM_CLANG_LIB_STATICANALYZER_CORE_RANGEDCONSTRAINTMANAGER_H +#ifndef LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_RANGEDCONSTRAINTMANAGER_H +#define LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_RANGEDCONSTRAINTMANAGER_H #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h" diff --git a/clang/include/clang/StaticAnalyzer/Frontend/ModelConsumer.h b/clang/include/clang/StaticAnalyzer/Frontend/ModelConsumer.h --- a/clang/include/clang/StaticAnalyzer/Frontend/ModelConsumer.h +++ b/clang/include/clang/StaticAnalyzer/Frontend/ModelConsumer.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_GR_MODELCONSUMER_H -#define LLVM_CLANG_GR_MODELCONSUMER_H +#ifndef LLVM_CLANG_STATICANALYZER_FRONTEND_MODELCONSUMER_H +#define LLVM_CLANG_STATICANALYZER_FRONTEND_MODELCONSUMER_H #include "clang/AST/ASTConsumer.h" #include "llvm/ADT/StringMap.h" diff --git a/clang/include/clang/Tooling/CommonOptionsParser.h b/clang/include/clang/Tooling/CommonOptionsParser.h --- a/clang/include/clang/Tooling/CommonOptionsParser.h +++ b/clang/include/clang/Tooling/CommonOptionsParser.h @@ -141,4 +141,4 @@ } // namespace tooling } // namespace clang -#endif // LLVM_TOOLS_CLANG_INCLUDE_CLANG_TOOLING_COMMONOPTIONSPARSER_H +#endif // LLVM_CLANG_TOOLING_COMMONOPTIONSPARSER_H diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_FILESYSTEM_H -#define LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_FILESYSTEM_H +#ifndef LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGFILESYSTEM_H +#define LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGFILESYSTEM_H #include "clang/Basic/LLVM.h" #include "clang/Lex/PreprocessorExcludedConditionalDirectiveSkipMapping.h" @@ -255,4 +255,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_FILESYSTEM_H +#endif // LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGFILESYSTEM_H diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h 
b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_SERVICE_H -#define LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_SERVICE_H +#ifndef LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGSERVICE_H +#define LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGSERVICE_H #include "clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h" @@ -83,4 +83,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_SERVICE_H +#endif // LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGSERVICE_H diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_TOOL_H -#define LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_TOOL_H +#ifndef LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGTOOL_H +#define LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGTOOL_H #include "clang/Tooling/DependencyScanning/DependencyScanningService.h" #include "clang/Tooling/DependencyScanning/DependencyScanningWorker.h" @@ -111,4 +111,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_TOOL_H +#endif // LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGTOOL_H diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_WORKER_H -#define LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_WORKER_H +#ifndef LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGWORKER_H +#define LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGWORKER_H #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileManager.h" @@ -91,4 +91,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_WORKER_H +#endif // LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGWORKER_H diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_MODULE_DEP_COLLECTOR_H -#define LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_MODULE_DEP_COLLECTOR_H +#ifndef LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_MODULEDEPCOLLECTOR_H +#define LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_MODULEDEPCOLLECTOR_H 
#include "clang/Basic/LLVM.h" #include "clang/Basic/SourceManager.h" @@ -234,4 +234,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_DEPENDENCY_SCANNING_MODULE_DEP_COLLECTOR_H +#endif // LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_MODULEDEPCOLLECTOR_H diff --git a/clang/include/clang/Tooling/FixIt.h b/clang/include/clang/Tooling/FixIt.h --- a/clang/include/clang/Tooling/FixIt.h +++ b/clang/include/clang/Tooling/FixIt.h @@ -76,4 +76,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_FIXINT_H +#endif // LLVM_CLANG_TOOLING_FIXIT_H diff --git a/clang/include/clang/Tooling/Refactoring/ASTSelection.h b/clang/include/clang/Tooling/Refactoring/ASTSelection.h --- a/clang/include/clang/Tooling/Refactoring/ASTSelection.h +++ b/clang/include/clang/Tooling/Refactoring/ASTSelection.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_AST_SELECTION_H -#define LLVM_CLANG_TOOLING_REFACTOR_AST_SELECTION_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_ASTSELECTION_H +#define LLVM_CLANG_TOOLING_REFACTORING_ASTSELECTION_H #include "clang/AST/ASTTypeTraits.h" #include "clang/AST/Stmt.h" @@ -152,4 +152,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_AST_SELECTION_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_ASTSELECTION_H diff --git a/clang/include/clang/Tooling/Refactoring/AtomicChange.h b/clang/include/clang/Tooling/Refactoring/AtomicChange.h --- a/clang/include/clang/Tooling/Refactoring/AtomicChange.h +++ b/clang/include/clang/Tooling/Refactoring/AtomicChange.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_ATOMICCHANGE_H -#define LLVM_CLANG_TOOLING_REFACTOR_ATOMICCHANGE_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_ATOMICCHANGE_H +#define LLVM_CLANG_TOOLING_REFACTORING_ATOMICCHANGE_H #include "clang/Basic/SourceManager.h" #include "clang/Format/Format.h" @@ -187,4 +187,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_ATOMICCHANGE_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_ATOMICCHANGE_H diff --git a/clang/include/clang/Tooling/Refactoring/Extract/Extract.h b/clang/include/clang/Tooling/Refactoring/Extract/Extract.h --- a/clang/include/clang/Tooling/Refactoring/Extract/Extract.h +++ b/clang/include/clang/Tooling/Refactoring/Extract/Extract.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_EXTRACT_EXTRACT_H -#define LLVM_CLANG_TOOLING_REFACTOR_EXTRACT_EXTRACT_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_EXTRACT_EXTRACT_H +#define LLVM_CLANG_TOOLING_REFACTORING_EXTRACT_EXTRACT_H #include "clang/Tooling/Refactoring/ASTSelection.h" #include "clang/Tooling/Refactoring/RefactoringActionRules.h" @@ -49,4 +49,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_EXTRACT_EXTRACT_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_EXTRACT_EXTRACT_H diff --git a/clang/include/clang/Tooling/Refactoring/Extract/SourceExtraction.h b/clang/include/clang/Tooling/Refactoring/Extract/SourceExtraction.h --- a/clang/include/clang/Tooling/Refactoring/Extract/SourceExtraction.h +++ b/clang/include/clang/Tooling/Refactoring/Extract/SourceExtraction.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef 
LLVM_CLANG_TOOLING_REFACTORING_EXTRACT_SOURCE_EXTRACTION_H -#define LLVM_CLANG_TOOLING_REFACTORING_EXTRACT_SOURCE_EXTRACTION_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_EXTRACT_SOURCEEXTRACTION_H +#define LLVM_CLANG_TOOLING_REFACTORING_EXTRACT_SOURCEEXTRACTION_H #include "clang/Basic/LLVM.h" @@ -48,4 +48,4 @@ } // end namespace tooling } // end namespace clang -#endif //LLVM_CLANG_TOOLING_REFACTORING_EXTRACT_SOURCE_EXTRACTION_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_EXTRACT_SOURCEEXTRACTION_H diff --git a/clang/include/clang/Tooling/Refactoring/Lookup.h b/clang/include/clang/Tooling/Refactoring/Lookup.h --- a/clang/include/clang/Tooling/Refactoring/Lookup.h +++ b/clang/include/clang/Tooling/Refactoring/Lookup.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_LOOKUP_H -#define LLVM_CLANG_TOOLING_REFACTOR_LOOKUP_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_LOOKUP_H +#define LLVM_CLANG_TOOLING_REFACTORING_LOOKUP_H #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -47,4 +47,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_LOOKUP_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_LOOKUP_H diff --git a/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h b/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h --- a/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h +++ b/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_RECURSIVE_SYMBOL_VISITOR_H -#define LLVM_CLANG_TOOLING_REFACTOR_RECURSIVE_SYMBOL_VISITOR_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_RECURSIVESYMBOLVISITOR_H +#define LLVM_CLANG_TOOLING_REFACTORING_RECURSIVESYMBOLVISITOR_H #include "clang/AST/AST.h" #include "clang/AST/RecursiveASTVisitor.h" @@ -150,4 +150,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_RECURSIVE_SYMBOL_VISITOR_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_RECURSIVESYMBOLVISITOR_H diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringAction.h b/clang/include/clang/Tooling/Refactoring/RefactoringAction.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringAction.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringAction.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_H -#define LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTION_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTION_H #include "clang/Basic/LLVM.h" #include "clang/Tooling/Refactoring/RefactoringActionRules.h" @@ -60,4 +60,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTION_H diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringActionRule.h b/clang/include/clang/Tooling/Refactoring/RefactoringActionRule.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringActionRule.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringActionRule.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULE_H -#define 
LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULE_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULE_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULE_H #include "clang/Basic/LLVM.h" #include "llvm/ADT/Optional.h" @@ -69,4 +69,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULE_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULE_H diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringActionRuleRequirements.h b/clang/include/clang/Tooling/Refactoring/RefactoringActionRuleRequirements.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringActionRuleRequirements.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringActionRuleRequirements.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULE_REQUIREMENTS_H -#define LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULE_REQUIREMENTS_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULEREQUIREMENTS_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULEREQUIREMENTS_H #include "clang/Basic/LLVM.h" #include "clang/Tooling/Refactoring/ASTSelection.h" @@ -119,4 +119,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULE_REQUIREMENTS_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULEREQUIREMENTS_H diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringActionRules.h b/clang/include/clang/Tooling/Refactoring/RefactoringActionRules.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringActionRules.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringActionRules.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULES_H -#define LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULES_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULES_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULES_H #include "clang/Tooling/Refactoring/RefactoringActionRule.h" #include "clang/Tooling/Refactoring/RefactoringActionRulesInternal.h" @@ -90,4 +90,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULES_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULES_H diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringActionRulesInternal.h b/clang/include/clang/Tooling/Refactoring/RefactoringActionRulesInternal.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringActionRulesInternal.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringActionRulesInternal.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULES_INTERNAL_H -#define LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULES_INTERNAL_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULESINTERNAL_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULESINTERNAL_H #include "clang/Basic/LLVM.h" #include "clang/Tooling/Refactoring/RefactoringActionRule.h" @@ -154,4 +154,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_ACTION_RULES_INTERNAL_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGACTIONRULESINTERNAL_H diff --git 
a/clang/include/clang/Tooling/Refactoring/RefactoringOption.h b/clang/include/clang/Tooling/Refactoring/RefactoringOption.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringOption.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringOption.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_OPTION_H -#define LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_OPTION_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGOPTION_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGOPTION_H #include "clang/Basic/LLVM.h" #include @@ -60,4 +60,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_OPTION_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGOPTION_H diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringOptionVisitor.h b/clang/include/clang/Tooling/Refactoring/RefactoringOptionVisitor.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringOptionVisitor.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringOptionVisitor.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_OPTION_VISITOR_H -#define LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_OPTION_VISITOR_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGOPTIONVISITOR_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGOPTIONVISITOR_H #include "clang/Basic/LLVM.h" #include @@ -58,4 +58,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_OPTION_VISITOR_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGOPTIONVISITOR_H diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringOptions.h b/clang/include/clang/Tooling/Refactoring/RefactoringOptions.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringOptions.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringOptions.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_OPTIONS_H -#define LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_OPTIONS_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGOPTIONS_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGOPTIONS_H #include "clang/Basic/LLVM.h" #include "clang/Tooling/Refactoring/RefactoringActionRuleRequirements.h" @@ -54,4 +54,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_OPTIONS_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGOPTIONS_H diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringResultConsumer.h b/clang/include/clang/Tooling/Refactoring/RefactoringResultConsumer.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringResultConsumer.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringResultConsumer.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_RESULT_CONSUMER_H -#define LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_RESULT_CONSUMER_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGRESULTCONSUMER_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGRESULTCONSUMER_H #include "clang/Basic/LLVM.h" #include "clang/Tooling/Refactoring/AtomicChange.h" @@ -48,4 +48,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_RESULT_CONSUMER_H +#endif // 
LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGRESULTCONSUMER_H diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringRuleContext.h b/clang/include/clang/Tooling/Refactoring/RefactoringRuleContext.h --- a/clang/include/clang/Tooling/Refactoring/RefactoringRuleContext.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringRuleContext.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_RULE_CONTEXT_H -#define LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_RULE_CONTEXT_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGRULECONTEXT_H +#define LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGRULECONTEXT_H #include "clang/Basic/DiagnosticError.h" #include "clang/Basic/SourceManager.h" @@ -86,4 +86,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_REFACTORING_RULE_CONTEXT_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_REFACTORINGRULECONTEXT_H diff --git a/clang/include/clang/Tooling/Refactoring/Rename/RenamingAction.h b/clang/include/clang/Tooling/Refactoring/Rename/RenamingAction.h --- a/clang/include/clang/Tooling/Refactoring/Rename/RenamingAction.h +++ b/clang/include/clang/Tooling/Refactoring/Rename/RenamingAction.h @@ -11,8 +11,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_RENAME_RENAMING_ACTION_H -#define LLVM_CLANG_TOOLING_REFACTOR_RENAME_RENAMING_ACTION_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_RENAME_RENAMINGACTION_H +#define LLVM_CLANG_TOOLING_REFACTORING_RENAME_RENAMINGACTION_H #include "clang/Tooling/Refactoring.h" #include "clang/Tooling/Refactoring/AtomicChange.h" @@ -120,4 +120,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_RENAME_RENAMING_ACTION_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_RENAME_RENAMINGACTION_H diff --git a/clang/include/clang/Tooling/Refactoring/Rename/SymbolName.h b/clang/include/clang/Tooling/Refactoring/Rename/SymbolName.h --- a/clang/include/clang/Tooling/Refactoring/Rename/SymbolName.h +++ b/clang/include/clang/Tooling/Refactoring/Rename/SymbolName.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_RENAME_SYMBOL_NAME_H -#define LLVM_CLANG_TOOLING_REFACTOR_RENAME_SYMBOL_NAME_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_RENAME_SYMBOLNAME_H +#define LLVM_CLANG_TOOLING_REFACTORING_RENAME_SYMBOLNAME_H #include "clang/Basic/LLVM.h" #include "llvm/ADT/ArrayRef.h" @@ -45,4 +45,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_RENAME_SYMBOL_NAME_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_RENAME_SYMBOLNAME_H diff --git a/clang/include/clang/Tooling/Refactoring/Rename/SymbolOccurrences.h b/clang/include/clang/Tooling/Refactoring/Rename/SymbolOccurrences.h --- a/clang/include/clang/Tooling/Refactoring/Rename/SymbolOccurrences.h +++ b/clang/include/clang/Tooling/Refactoring/Rename/SymbolOccurrences.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_RENAME_SYMBOL_OCCURRENCES_H -#define LLVM_CLANG_TOOLING_REFACTOR_RENAME_SYMBOL_OCCURRENCES_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_RENAME_SYMBOLOCCURRENCES_H +#define LLVM_CLANG_TOOLING_REFACTORING_RENAME_SYMBOLOCCURRENCES_H #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -88,4 +88,4 @@ } // end namespace tooling } // 
end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_RENAME_SYMBOL_OCCURRENCES_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_RENAME_SYMBOLOCCURRENCES_H diff --git a/clang/include/clang/Tooling/Refactoring/Rename/USRFinder.h b/clang/include/clang/Tooling/Refactoring/Rename/USRFinder.h --- a/clang/include/clang/Tooling/Refactoring/Rename/USRFinder.h +++ b/clang/include/clang/Tooling/Refactoring/Rename/USRFinder.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_RENAME_USR_FINDER_H -#define LLVM_CLANG_TOOLING_REFACTOR_RENAME_USR_FINDER_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_RENAME_USRFINDER_H +#define LLVM_CLANG_TOOLING_REFACTORING_RENAME_USRFINDER_H #include "clang/AST/AST.h" #include "clang/AST/ASTContext.h" @@ -46,4 +46,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_RENAME_USR_FINDER_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_RENAME_USRFINDER_H diff --git a/clang/include/clang/Tooling/Refactoring/Rename/USRFindingAction.h b/clang/include/clang/Tooling/Refactoring/Rename/USRFindingAction.h --- a/clang/include/clang/Tooling/Refactoring/Rename/USRFindingAction.h +++ b/clang/include/clang/Tooling/Refactoring/Rename/USRFindingAction.h @@ -11,8 +11,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_RENAME_USR_FINDING_ACTION_H -#define LLVM_CLANG_TOOLING_REFACTOR_RENAME_USR_FINDING_ACTION_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_RENAME_USRFINDINGACTION_H +#define LLVM_CLANG_TOOLING_REFACTORING_RENAME_USRFINDINGACTION_H #include "clang/Basic/LLVM.h" #include "llvm/ADT/ArrayRef.h" @@ -64,4 +64,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_RENAME_USR_FINDING_ACTION_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_RENAME_USRFINDINGACTION_H diff --git a/clang/include/clang/Tooling/Refactoring/Rename/USRLocFinder.h b/clang/include/clang/Tooling/Refactoring/Rename/USRLocFinder.h --- a/clang/include/clang/Tooling/Refactoring/Rename/USRLocFinder.h +++ b/clang/include/clang/Tooling/Refactoring/Rename/USRLocFinder.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_RENAME_USR_LOC_FINDER_H -#define LLVM_CLANG_TOOLING_REFACTOR_RENAME_USR_LOC_FINDER_H +#ifndef LLVM_CLANG_TOOLING_REFACTORING_RENAME_USRLOCFINDER_H +#define LLVM_CLANG_TOOLING_REFACTORING_RENAME_USRLOCFINDER_H #include "clang/AST/AST.h" #include "clang/Tooling/Core/Replacement.h" @@ -49,4 +49,4 @@ } // end namespace tooling } // end namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_RENAME_USR_LOC_FINDER_H +#endif // LLVM_CLANG_TOOLING_REFACTORING_RENAME_USRLOCFINDER_H diff --git a/clang/include/clang/Tooling/Syntax/BuildTree.h b/clang/include/clang/Tooling/Syntax/BuildTree.h --- a/clang/include/clang/Tooling/Syntax/BuildTree.h +++ b/clang/include/clang/Tooling/Syntax/BuildTree.h @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// // Functions to construct a syntax tree from an AST. 
//===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_H -#define LLVM_CLANG_TOOLING_SYNTAX_TREE_H +#ifndef LLVM_CLANG_TOOLING_SYNTAX_BUILDTREE_H +#define LLVM_CLANG_TOOLING_SYNTAX_BUILDTREE_H #include "clang/AST/Decl.h" #include "clang/Basic/TokenKinds.h" diff --git a/clang/include/clang/Tooling/Syntax/Tree.h b/clang/include/clang/Tooling/Syntax/Tree.h --- a/clang/include/clang/Tooling/Syntax/Tree.h +++ b/clang/include/clang/Tooling/Syntax/Tree.h @@ -18,8 +18,8 @@ // This is still work in progress and highly experimental, we leave room for // ourselves to completely change the design and/or implementation. //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_CASCADE_H -#define LLVM_CLANG_TOOLING_SYNTAX_TREE_CASCADE_H +#ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_H +#define LLVM_CLANG_TOOLING_SYNTAX_TREE_H #include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" diff --git a/clang/include/clang/Tooling/Transformer/MatchConsumer.h b/clang/include/clang/Tooling/Transformer/MatchConsumer.h --- a/clang/include/clang/Tooling/Transformer/MatchConsumer.h +++ b/clang/include/clang/Tooling/Transformer/MatchConsumer.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_MATCH_CONSUMER_H_ -#define LLVM_CLANG_TOOLING_TRANSFORMER_MATCH_CONSUMER_H_ +#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_MATCHCONSUMER_H +#define LLVM_CLANG_TOOLING_TRANSFORMER_MATCHCONSUMER_H #include "clang/AST/ASTTypeTraits.h" #include "clang/ASTMatchers/ASTMatchFinder.h" @@ -100,4 +100,4 @@ } } // namespace transformer } // namespace clang -#endif // LLVM_CLANG_TOOLING_TRANSFORMER_MATCH_CONSUMER_H_ +#endif // LLVM_CLANG_TOOLING_TRANSFORMER_MATCHCONSUMER_H diff --git a/clang/include/clang/Tooling/Transformer/Parsing.h b/clang/include/clang/Tooling/Transformer/Parsing.h --- a/clang/include/clang/Tooling/Transformer/Parsing.h +++ b/clang/include/clang/Tooling/Transformer/Parsing.h @@ -13,8 +13,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_PARSING_H_ -#define LLVM_CLANG_TOOLING_REFACTOR_PARSING_H_ +#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_PARSING_H +#define LLVM_CLANG_TOOLING_TRANSFORMER_PARSING_H #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/Basic/SourceLocation.h" @@ -37,4 +37,4 @@ } // namespace transformer } // namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_PARSING_H_ +#endif // LLVM_CLANG_TOOLING_TRANSFORMER_PARSING_H diff --git a/clang/include/clang/Tooling/Transformer/RangeSelector.h b/clang/include/clang/Tooling/Transformer/RangeSelector.h --- a/clang/include/clang/Tooling/Transformer/RangeSelector.h +++ b/clang/include/clang/Tooling/Transformer/RangeSelector.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_REFACTOR_RANGE_SELECTOR_H_ -#define LLVM_CLANG_TOOLING_REFACTOR_RANGE_SELECTOR_H_ +#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_RANGESELECTOR_H +#define LLVM_CLANG_TOOLING_TRANSFORMER_RANGESELECTOR_H #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/Basic/SourceLocation.h" @@ -105,4 +105,4 @@ } // namespace transformer } // namespace clang -#endif // LLVM_CLANG_TOOLING_REFACTOR_RANGE_SELECTOR_H_ +#endif // LLVM_CLANG_TOOLING_TRANSFORMER_RANGESELECTOR_H diff --git 
a/clang/include/clang/Tooling/Transformer/RewriteRule.h b/clang/include/clang/Tooling/Transformer/RewriteRule.h --- a/clang/include/clang/Tooling/Transformer/RewriteRule.h +++ b/clang/include/clang/Tooling/Transformer/RewriteRule.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_REWRITE_RULE_H_ -#define LLVM_CLANG_TOOLING_TRANSFORMER_REWRITE_RULE_H_ +#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_REWRITERULE_H +#define LLVM_CLANG_TOOLING_TRANSFORMER_REWRITERULE_H #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -450,4 +450,4 @@ } // namespace transformer } // namespace clang -#endif // LLVM_CLANG_TOOLING_TRANSFORMER_REWRITE_RULE_H_ +#endif // LLVM_CLANG_TOOLING_TRANSFORMER_REWRITERULE_H diff --git a/clang/include/clang/Tooling/Transformer/SourceCode.h b/clang/include/clang/Tooling/Transformer/SourceCode.h --- a/clang/include/clang/Tooling/Transformer/SourceCode.h +++ b/clang/include/clang/Tooling/Transformer/SourceCode.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_SOURCE_CODE_H -#define LLVM_CLANG_TOOLING_TRANSFORMER_SOURCE_CODE_H +#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_SOURCECODE_H +#define LLVM_CLANG_TOOLING_TRANSFORMER_SOURCECODE_H #include "clang/AST/ASTContext.h" #include "clang/Basic/SourceLocation.h" @@ -100,4 +100,4 @@ } } // namespace tooling } // namespace clang -#endif // LLVM_CLANG_TOOLING_TRANSFORMER_SOURCE_CODE_H +#endif // LLVM_CLANG_TOOLING_TRANSFORMER_SOURCECODE_H diff --git a/clang/include/clang/Tooling/Transformer/SourceCodeBuilders.h b/clang/include/clang/Tooling/Transformer/SourceCodeBuilders.h --- a/clang/include/clang/Tooling/Transformer/SourceCodeBuilders.h +++ b/clang/include/clang/Tooling/Transformer/SourceCodeBuilders.h @@ -11,8 +11,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_SOURCE_CODE_BUILDERS_H_ -#define LLVM_CLANG_TOOLING_TRANSFORMER_SOURCE_CODE_BUILDERS_H_ +#ifndef LLVM_CLANG_TOOLING_TRANSFORMER_SOURCECODEBUILDERS_H +#define LLVM_CLANG_TOOLING_TRANSFORMER_SOURCECODEBUILDERS_H #include "clang/AST/ASTContext.h" #include "clang/AST/Expr.h" @@ -83,4 +83,4 @@ } // namespace tooling } // namespace clang -#endif // LLVM_CLANG_TOOLING_TRANSFORMER_SOURCE_CODE_BUILDERS_H_ +#endif // LLVM_CLANG_TOOLING_TRANSFORMER_SOURCECODEBUILDERS_H diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -8476,8 +8476,8 @@ FieldDecl *Field = FieldDecl::Create( const_cast(*Context), VaListTagDecl, SourceLocation(), SourceLocation(), &Context->Idents.get(FieldNames[i]), FieldTypes[i], - /*TInfo=*/0, - /*BitWidth=*/0, + /*TInfo=*/nullptr, + /*BitWidth=*/nullptr, /*Mutable=*/false, ICIS_NoInit); Field->setAccess(AS_public); VaListTagDecl->addDecl(Field); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -10680,28 +10680,55 @@ bool HadZeroInit = Value->hasValue(); if (const ConstantArrayType *CAT = Info.Ctx.getAsConstantArrayType(Type)) { - unsigned N = CAT->getSize().getZExtValue(); + unsigned FinalSize = CAT->getSize().getZExtValue(); // Preserve the array filler if we had prior zero-initialization. APValue Filler = HadZeroInit && Value->hasArrayFiller() ? 
Value->getArrayFiller() : APValue(); - *Value = APValue(APValue::UninitArray(), N, N); - - if (HadZeroInit) - for (unsigned I = 0; I != N; ++I) - Value->getArrayInitializedElt(I) = Filler; + *Value = APValue(APValue::UninitArray(), 0, FinalSize); + if (FinalSize == 0) + return true; - // Initialize the elements. LValue ArrayElt = Subobject; ArrayElt.addArray(Info, E, CAT); - for (unsigned I = 0; I != N; ++I) - if (!VisitCXXConstructExpr(E, ArrayElt, &Value->getArrayInitializedElt(I), - CAT->getElementType()) || - !HandleLValueArrayAdjustment(Info, E, ArrayElt, CAT->getElementType(), - 1)) - return false; + // We do the whole initialization in two passes, first for just one element, + // then for the whole array. It's possible we may find out we can't do const + // init in the first pass, in which case we avoid allocating a potentially + // large array. We don't do more passes because expanding the array requires + // copying the data, which is wasteful. + for (const unsigned N : {1u, FinalSize}) { + unsigned OldElts = Value->getArrayInitializedElts(); + if (OldElts == N) + break; + + // Expand the array to appropriate size. + APValue NewValue(APValue::UninitArray(), N, FinalSize); + for (unsigned I = 0; I < OldElts; ++I) + NewValue.getArrayInitializedElt(I).swap( + Value->getArrayInitializedElt(I)); + Value->swap(NewValue); + + if (HadZeroInit) + for (unsigned I = OldElts; I < N; ++I) + Value->getArrayInitializedElt(I) = Filler; + + // Initialize the elements. + for (unsigned I = OldElts; I < N; ++I) { + if (!VisitCXXConstructExpr(E, ArrayElt, + &Value->getArrayInitializedElt(I), + CAT->getElementType()) || + !HandleLValueArrayAdjustment(Info, E, ArrayElt, + CAT->getElementType(), 1)) + return false; + // When checking for const initialization, any diagnostic is considered + // an error. + if (Info.EvalStatus.Diag && !Info.EvalStatus.Diag->empty() && + !Info.keepEvaluatingAfterFailure()) + return false; + } + } return true; } diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -21,7 +21,6 @@ using clang::analyze_format_string::FormatSpecifier; using clang::analyze_format_string::LengthModifier; using clang::analyze_format_string::OptionalAmount; -using clang::analyze_format_string::PositionContext; using clang::analyze_format_string::ConversionSpecifier; using namespace clang; diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -194,7 +194,7 @@ ID.AddInteger(ArraySize.getZExtValue()); ID.AddInteger(SizeMod); ID.AddInteger(TypeQuals); - ID.AddBoolean(SizeExpr != 0); + ID.AddBoolean(SizeExpr != nullptr); if (SizeExpr) SizeExpr->Profile(ID, Context, true); } diff --git a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt --- a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt @@ -1,4 +1,5 @@ add_clang_library(clangAnalysisFlowSensitive + ControlFlowContext.cpp TypeErasedDataflowAnalysis.cpp LINK_LIBS diff --git a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp @@ -0,0 +1,68 @@ +//===- ControlFlowContext.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
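The two-pass strategy in the ExprConstant.cpp hunk above can be sketched independently of the interpreter's types. The snippet below is a minimal stand-alone illustration, not the actual implementation: initElement is a hypothetical stand-in for VisitCXXConstructExpr, and std::vector stands in for APValue's array storage.

#include <cstddef>
#include <vector>

// Hypothetical stand-in for VisitCXXConstructExpr: initializes one slot and
// reports whether constant evaluation of that element succeeded.
static bool initElement(std::size_t I, int &Slot) {
  Slot = static_cast<int>(I);
  return true;
}

// First initialize a single element; only if that succeeds, grow to the full
// size. A failing constant initializer therefore never pays for allocating
// (and then populating) a potentially huge array.
static bool initArrayTwoPass(std::vector<int> &Value, std::size_t FinalSize) {
  Value.clear();
  if (FinalSize == 0)
    return true;
  for (std::size_t N : {std::size_t{1}, FinalSize}) {
    std::size_t OldElts = Value.size();
    if (OldElts == N)
      break;
    Value.resize(N); // expand, preserving already-initialized elements
    for (std::size_t I = OldElts; I < N; ++I)
      if (!initElement(I, Value[I]))
        return false; // bail out before the full allocation is populated
  }
  return true;
}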
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a ControlFlowContext class that is used by dataflow +// analyses that run over Control-Flow Graphs (CFGs). +// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/FlowSensitive/ControlFlowContext.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/Decl.h" +#include "clang/AST/Stmt.h" +#include "clang/Analysis/CFG.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Error.h" +#include + +namespace clang { +namespace dataflow { + +/// Returns a map from statements to basic blocks that contain them. +static llvm::DenseMap<const Stmt *, const CFGBlock *> +buildStmtToBasicBlockMap(const CFG &Cfg) { + llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock; + for (const CFGBlock *Block : Cfg) { + if (Block == nullptr) + continue; + + for (const CFGElement &Element : *Block) { + auto Stmt = Element.getAs<CFGStmt>(); + if (!Stmt.hasValue()) + continue; + + StmtToBlock[Stmt.getValue().getStmt()] = Block; + } + } + return StmtToBlock; +} + +llvm::Expected<ControlFlowContext> +ControlFlowContext::build(const Decl *D, Stmt *S, ASTContext *C) { + CFG::BuildOptions Options; + Options.PruneTriviallyFalseEdges = false; + Options.AddImplicitDtors = true; + Options.AddTemporaryDtors = true; + Options.AddInitializers = true; + + // Ensure that all sub-expressions in basic blocks are evaluated. + Options.setAllAlwaysAdd(); + + auto Cfg = CFG::buildCFG(D, S, C, Options); + if (Cfg == nullptr) + return llvm::createStringError( + std::make_error_code(std::errc::invalid_argument), + "CFG::buildCFG failed"); + + llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock = + buildStmtToBasicBlockMap(*Cfg); + return ControlFlowContext(std::move(Cfg), std::move(StmtToBlock)); +} + +} // namespace dataflow +} // namespace clang diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp --- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp +++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp @@ -35,6 +35,7 @@ /// already been transferred. States in `BlockStates` that are set to /// `llvm::None` represent basic blocks that are not evaluated yet. static TypeErasedDataflowAnalysisState computeBlockInputState( + const ControlFlowContext &CFCtx, std::vector<llvm::Optional<TypeErasedDataflowAnalysisState>> &BlockStates, const CFGBlock &Block, const Environment &InitEnv, TypeErasedDataflowAnalysis &Analysis) { @@ -43,7 +44,40 @@ // the state of each basic block differently. TypeErasedDataflowAnalysisState State = {Analysis.typeErasedInitialElement(), InitEnv}; - for (const CFGBlock *Pred : Block.preds()) { + + llvm::DenseSet<const CFGBlock *> Preds; + Preds.insert(Block.pred_begin(), Block.pred_end()); + if (Block.getTerminator().isTemporaryDtorsBranch()) { + // This handles a special case where the code that produced the CFG includes + // a conditional operator with a branch that constructs a temporary and + // calls a destructor annotated as noreturn.
The CFG models this as follows: + // + // B1 (contains the condition of the conditional operator) - succs: B2, B3 + // B2 (contains code that does not call a noreturn destructor) - succs: B4 + // B3 (contains code that calls a noreturn destructor) - succs: B4 + // B4 (has temporary destructor terminator) - succs: B5, B6 + // B5 (noreturn block that is associated with the noreturn destructor call) + // B6 (contains code that follows the conditional operator statement) + // + // The first successor (B5 above) of a basic block with a temporary + // destructor terminator (B4 above) is the block that evaluates the + // destructor. If that block has a noreturn element then the predecessor + // block that constructed the temporary object (B3 above) is effectively a + // noreturn block and its state should not be used as input for the state + // of the block that has a temporary destructor terminator (B4 above). This + // holds regardless of which branch of the ternary operator calls the + // noreturn destructor. However, it doesn't handle cases where a nested ternary + // operator includes a branch that contains a noreturn destructor call. + // + // See `NoreturnDestructorTest` for concrete examples. + if (Block.succ_begin()->getReachableBlock()->hasNoReturnElement()) { + auto StmtBlock = CFCtx.getStmtToBlock().find(Block.getTerminatorStmt()); + assert(StmtBlock != CFCtx.getStmtToBlock().end()); + Preds.erase(StmtBlock->getSecond()); + } + } + + for (const CFGBlock *Pred : Preds) { // Skip if the `Block` is unreachable or control flow cannot get past it. if (!Pred || Pred->hasNoReturnElement()) continue; @@ -64,6 +98,7 @@ } TypeErasedDataflowAnalysisState transferBlock( + const ControlFlowContext &CFCtx, std::vector<llvm::Optional<TypeErasedDataflowAnalysisState>> &BlockStates, const CFGBlock &Block, const Environment &InitEnv, TypeErasedDataflowAnalysis &Analysis, @@ -71,7 +106,7 @@ const TypeErasedDataflowAnalysisState &)> HandleTransferredStmt) { TypeErasedDataflowAnalysisState State = - computeBlockInputState(BlockStates, Block, InitEnv, Analysis); + computeBlockInputState(CFCtx, BlockStates, Block, InitEnv, Analysis); for (const CFGElement &Element : Block) { // FIXME: Evaluate other kinds of `CFGElement`. const llvm::Optional<CFGStmt> Stmt = Element.getAs<CFGStmt>(); @@ -89,21 +124,21 @@ } std::vector<llvm::Optional<TypeErasedDataflowAnalysisState>> -runTypeErasedDataflowAnalysis(const CFG &Cfg, +runTypeErasedDataflowAnalysis(const ControlFlowContext &CFCtx, TypeErasedDataflowAnalysis &Analysis, const Environment &InitEnv) { // FIXME: Consider enforcing that `Cfg` meets the requirements that // are specified in the header. This could be done by remembering // what options were used to build `Cfg` and asserting on them here. - PostOrderCFGView POV(&Cfg); - ForwardDataflowWorklist Worklist(Cfg, &POV); + PostOrderCFGView POV(&CFCtx.getCFG()); + ForwardDataflowWorklist Worklist(CFCtx.getCFG(), &POV); std::vector<llvm::Optional<TypeErasedDataflowAnalysisState>> BlockStates; - BlockStates.resize(Cfg.size(), llvm::None); + BlockStates.resize(CFCtx.getCFG().size(), llvm::None); // The entry basic block doesn't contain statements so it can be skipped.
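The comment above describes a fairly specific CFG shape. A minimal source example that produces it, modeled on the NoreturnDestructorTest cases it references (the exact test inputs may differ):

#include <cstdlib>

struct Fatal {
  __attribute__((noreturn)) ~Fatal() { std::abort(); }
};

int value(bool B) {
  // The false branch constructs a Fatal temporary whose noreturn destructor
  // runs on that path; its block must not contribute input state to the
  // block carrying the temporary-destructor terminator.
  return B ? 1 : (Fatal(), 0);
}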
- const CFGBlock &Entry = Cfg.getEntry(); + const CFGBlock &Entry = CFCtx.getCFG().getEntry(); BlockStates[Entry.getBlockID()] = {Analysis.typeErasedInitialElement(), InitEnv}; Worklist.enqueueSuccessors(&Entry); @@ -125,7 +160,7 @@ const llvm::Optional &OldBlockState = BlockStates[Block->getBlockID()]; TypeErasedDataflowAnalysisState NewBlockState = - transferBlock(BlockStates, *Block, InitEnv, Analysis); + transferBlock(CFCtx, BlockStates, *Block, InitEnv, Analysis); if (OldBlockState.hasValue() && Analysis.isEqualTypeErased(OldBlockState.getValue().Lattice, diff --git a/clang/lib/Basic/TargetID.cpp b/clang/lib/Basic/TargetID.cpp --- a/clang/lib/Basic/TargetID.cpp +++ b/clang/lib/Basic/TargetID.cpp @@ -15,7 +15,7 @@ namespace clang { -static const llvm::SmallVector +static llvm::SmallVector getAllPossibleAMDGPUTargetIDFeatures(const llvm::Triple &T, llvm::StringRef Proc) { // Entries in returned vector should be in alphabetical order. @@ -33,7 +33,7 @@ return Ret; } -const llvm::SmallVector +llvm::SmallVector getAllPossibleTargetIDFeatures(const llvm::Triple &T, llvm::StringRef Processor) { llvm::SmallVector Ret; diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -390,7 +390,7 @@ llvm::PHINode *PHI = Builder.CreatePHI(BasePtrTy, 2, "cast.result"); PHI->addIncoming(Value.getPointer(), notNullBB); PHI->addIncoming(llvm::Constant::getNullValue(BasePtrTy), origBB); - Value = Address(PHI, Value.getAlignment()); + Value = Value.withPointer(PHI); } return Value; @@ -1983,7 +1983,7 @@ CharUnits eltAlignment = arrayBase.getAlignment() .alignmentOfArrayElement(getContext().getTypeSizeInChars(type)); - Address curAddr = Address(cur, eltAlignment); + Address curAddr = Address(cur, elementType, eltAlignment); // Zero initialize the storage, if requested. if (zeroInitialize) diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -363,7 +363,7 @@ /// Extended dereferencing mechanism is has the following format: /// DW_OP_constu DW_OP_swap DW_OP_xderef void AppendAddressSpaceXDeref(unsigned AddressSpace, - SmallVectorImpl &Expr) const; + SmallVectorImpl &Expr) const; /// A helper function to collect debug info for the default elements of a /// block. diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -722,7 +722,7 @@ auto *LowerBound = llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned( llvm::Type::getInt64Ty(CGM.getLLVMContext()), 0)); - SmallVector Expr( + SmallVector Expr( {llvm::dwarf::DW_OP_constu, NumElemsPerVG, llvm::dwarf::DW_OP_bregx, /* AArch64::VG */ 46, 0, llvm::dwarf::DW_OP_mul, llvm::dwarf::DW_OP_constu, 1, llvm::dwarf::DW_OP_minus}); @@ -768,7 +768,7 @@ } // Element count = (VLENB / SEW) x LMUL - SmallVector Expr( + SmallVector Expr( // The DW_OP_bregx operation has two operands: a register which is // specified by an unsigned LEB128 number, followed by a signed LEB128 // offset. 
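The TargetID.cpp hunk above drops const from by-value return types. A small self-contained illustration of why that matters (function names here are hypothetical): a const prvalue cannot bind to a move constructor's rvalue reference, so callers silently copy instead of moving.

#include <vector>

static std::vector<int> make() { return std::vector<int>(1000); }
static const std::vector<int> makeConst() { return std::vector<int>(1000); }

void consume() {
  std::vector<int> A = make();      // moved (or elided entirely)
  std::vector<int> B = makeConst(); // copied: the const result cannot bind
                                    // to vector&&
}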
@@ -4325,7 +4325,7 @@ } void CGDebugInfo::AppendAddressSpaceXDeref( - unsigned AddressSpace, SmallVectorImpl &Expr) const { + unsigned AddressSpace, SmallVectorImpl &Expr) const { Optional DWARFAddressSpace = CGM.getTarget().getDWARFAddressSpace(AddressSpace); if (!DWARFAddressSpace) @@ -4494,7 +4494,7 @@ Line = getLineNumber(VD->getLocation()); Column = getColumnNumber(VD->getLocation()); } - SmallVector Expr; + SmallVector Expr; llvm::DINode::DIFlags Flags = llvm::DINode::FlagZero; if (VD->isImplicit()) Flags |= llvm::DINode::FlagArtificial; @@ -4720,7 +4720,7 @@ target.getStructLayout(blockInfo.StructureType) ->getElementOffset(blockInfo.getCapture(VD).getIndex())); - SmallVector addr; + SmallVector addr; addr.push_back(llvm::dwarf::DW_OP_deref); addr.push_back(llvm::dwarf::DW_OP_plus_uconst); addr.push_back(offset.getQuantity()); @@ -5191,7 +5191,7 @@ } else { auto Align = getDeclAlignIfRequired(D, CGM.getContext()); - SmallVector Expr; + SmallVector Expr; unsigned AddressSpace = CGM.getContext().getTargetAddressSpace(D->getType()); if (CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) { diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1392,9 +1392,11 @@ else { // Create an artificial VarDecl to generate debug info for. IdentifierInfo *NameIdent = VLAExprNames[NameIdx++]; - auto VlaExprTy = VlaSize.NumElts->getType()->getPointerElementType(); + assert(cast(VlaSize.NumElts->getType()) + ->isOpaqueOrPointeeTypeMatches(SizeTy) && + "Number of VLA elements must be SizeTy"); auto QT = getContext().getIntTypeForBitwidth( - VlaExprTy->getScalarSizeInBits(), false); + SizeTy->getScalarSizeInBits(), false); auto *ArtificialDecl = VarDecl::Create( getContext(), const_cast(D.getDeclContext()), D.getLocation(), D.getLocation(), NameIdent, QT, diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -614,8 +614,8 @@ // every temporary created in a default argument is sequenced before // the construction of the next array element, if any CodeGenFunction::RunCleanupsScope CleanupsScope(CGF); - LValue elementLV = - CGF.MakeAddrLValue(Address(currentElement, elementAlign), elementType); + LValue elementLV = CGF.MakeAddrLValue( + Address(currentElement, llvmElementType, elementAlign), elementType); if (filler) EmitInitializationToLValue(filler, elementLV); else @@ -1801,6 +1801,7 @@ CharUnits elementSize = CGF.getContext().getTypeSizeInChars(elementType); CharUnits elementAlign = destPtr.getAlignment().alignmentOfArrayElement(elementSize); + llvm::Type *llvmElementType = CGF.ConvertTypeForMem(elementType); llvm::BasicBlock *entryBB = Builder.GetInsertBlock(); llvm::BasicBlock *bodyBB = CGF.createBasicBlock("arrayinit.body"); @@ -1810,8 +1811,8 @@ llvm::PHINode *index = Builder.CreatePHI(zero->getType(), 2, "arrayinit.index"); index->addIncoming(zero, entryBB); - llvm::Value *element = Builder.CreateInBoundsGEP( - begin->getType()->getPointerElementType(), begin, index); + llvm::Value *element = + Builder.CreateInBoundsGEP(llvmElementType, begin, index); // Prepare for a cleanup. 
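The CGExprAgg change above threads the element type to CreateInBoundsGEP instead of recovering it via getPointerElementType(), which goes away with opaque pointers. A minimal sketch of the pattern (the helper name is hypothetical):

#include "llvm/IR/IRBuilder.h"

// With opaque pointers, Base's pointer type no longer encodes the pointee,
// so the element type must be passed in explicitly by whoever still knows it.
llvm::Value *indexArray(llvm::IRBuilder<> &B, llvm::Type *ElemTy,
                        llvm::Value *Base, llvm::Value *Idx) {
  return B.CreateInBoundsGEP(ElemTy, Base, Idx, "elt");
}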
QualType::DestructionKind dtorKind = elementType.isDestructedType(); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1429,10 +1429,11 @@ llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags) { + uint32_t SrcLocStrSize; llvm::Constant *SrcLocStr; if (CGM.getCodeGenOpts().getDebugInfo() == codegenoptions::NoDebugInfo || Loc.isInvalid()) { - SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(); + SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); } else { std::string FunctionName; if (const auto *FD = dyn_cast_or_null<FunctionDecl>(CGF.CurFuncDecl)) @@ -1441,12 +1442,12 @@ const char *FileName = PLoc.getFilename(); unsigned Line = PLoc.getLine(); unsigned Column = PLoc.getColumn(); - SrcLocStr = - OMPBuilder.getOrCreateSrcLocStr(FunctionName, FileName, Line, Column); + SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(FunctionName, FileName, Line, + Column, SrcLocStrSize); } unsigned Reserved2Flags = getDefaultLocationReserved2Flags(); - return OMPBuilder.getOrCreateIdent(SrcLocStr, llvm::omp::IdentFlag(Flags), - Reserved2Flags); + return OMPBuilder.getOrCreateIdent( + SrcLocStr, SrcLocStrSize, llvm::omp::IdentFlag(Flags), Reserved2Flags); } llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF, @@ -1457,10 +1458,11 @@ if (CGM.getLangOpts().OpenMPIRBuilder) { SmallString<128> Buffer; OMPBuilder.updateToLocation(CGF.Builder.saveIP()); + uint32_t SrcLocStrSize; auto *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr( - getIdentStringFromSourceLocation(CGF, Loc, Buffer)); + getIdentStringFromSourceLocation(CGF, Loc, Buffer), SrcLocStrSize); return OMPBuilder.getOrCreateThreadID( - OMPBuilder.getOrCreateIdent(SrcLocStr)); + OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize)); } llvm::Value *ThreadID = nullptr; @@ -9527,8 +9529,9 @@ emitMappingInformation(CodeGenFunction &CGF, llvm::OpenMPIRBuilder &OMPBuilder, MappableExprsHandler::MappingExprInfo &MapExprs) { + uint32_t SrcLocStrSize; if (!MapExprs.getMapDecl() && !MapExprs.getMapExpr()) - return OMPBuilder.getOrCreateDefaultSrcLocStr(); + return OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); SourceLocation Loc; if (!MapExprs.getMapDecl() && MapExprs.getMapExpr()) { @@ -9552,7 +9555,8 @@ PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc); return OMPBuilder.getOrCreateSrcLocStr(PLoc.getFilename(), ExprName, - PLoc.getLine(), PLoc.getColumn()); + PLoc.getLine(), PLoc.getColumn(), + SrcLocStrSize); } /// Emit the arrays used to pass the captures and map information to the diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1402,10 +1402,14 @@ // Allocate space for the variable to be globalized llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; - llvm::Instruction *VoidPtr = + llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_alloc_shared), AllocArgs, VD->getName()); + // FIXME: We should use the variable's actual alignment as an argument. + VoidPtr->addRetAttr(llvm::Attribute::get( + CGM.getLLVMContext(), llvm::Attribute::Alignment, + CGM.getContext().getTargetInfo().getNewAlign() / 8)); // Cast the void pointer and get the address of the globalized variable.
llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo(); @@ -1438,10 +1442,13 @@ // Allocate space for this VLA object to be globalized. llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; - llvm::Instruction *VoidPtr = + llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_alloc_shared), AllocArgs, VD->getName()); + VoidPtr->addRetAttr( + llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment, + CGM.getContext().getTargetInfo().getNewAlign())); I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back( std::pair( diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -182,6 +182,7 @@ struct saved_type { DominatingLLVMValue::saved_type SavedValue; + llvm::Type *ElementType; CharUnits Alignment; }; @@ -190,11 +191,11 @@ } static saved_type save(CodeGenFunction &CGF, type value) { return { DominatingLLVMValue::save(CGF, value.getPointer()), - value.getAlignment() }; + value.getElementType(), value.getAlignment() }; } static type restore(CodeGenFunction &CGF, saved_type value) { return Address(DominatingLLVMValue::restore(CGF, value.SavedValue), - value.Alignment); + value.ElementType, value.Alignment); } }; diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1595,9 +1595,9 @@ if (!InstrumentRegions || !isInstrumentedCondition(Cond)) return EmitBranchOnBoolExpr(Cond, TrueBlock, FalseBlock, TrueCount, LH); - llvm::BasicBlock *ThenBlock = NULL; - llvm::BasicBlock *ElseBlock = NULL; - llvm::BasicBlock *NextBlock = NULL; + llvm::BasicBlock *ThenBlock = nullptr; + llvm::BasicBlock *ElseBlock = nullptr; + llvm::BasicBlock *NextBlock = nullptr; // Create the block we'll use to increment the appropriate counter. llvm::BasicBlock *CounterIncrBlock = createBasicBlock("lop.rhscnt"); diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -697,8 +697,8 @@ CharUnits VTablePtrAlign = CGF.CGM.getDynamicOffsetAlignment(ThisAddr.getAlignment(), RD, CGF.getPointerAlign()); - llvm::Value *VTable = - CGF.GetVTablePtr(Address(This, VTablePtrAlign), VTableTy, RD); + llvm::Value *VTable = CGF.GetVTablePtr( + Address(This, ThisAddr.getElementType(), VTablePtrAlign), VTableTy, RD); // Apply the offset. // On ARM64, to reserve extra space in virtual member function pointers, diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -8693,7 +8693,7 @@ llvm::ConstantInt::get(CGF.Int32Ty, ArgSize), "__new_saved_reg_area_pointer"); - llvm::Value *UsingStack = 0; + llvm::Value *UsingStack = nullptr; UsingStack = CGF.Builder.CreateICmpSGT(__new_saved_reg_area_pointer, __saved_reg_area_end_pointer); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -437,7 +437,7 @@ // Enforce -static if -miamcu is present. 
if (Args.hasFlag(options::OPT_miamcu, options::OPT_mno_iamcu, false)) - DAL->AddFlagArg(0, Opts.getOption(options::OPT_static)); + DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_static)); // Add a default value of -mlinker-version=, if one was given and the user // didn't specify one. diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3217,9 +3217,7 @@ return; } // Check whether the target subarch supports the hardware TLS register - if (arm::getARMSubArchVersionNumber(EffectiveTriple) < 7 && - llvm::ARM::parseArch(EffectiveTriple.getArchName()) != - llvm::ARM::ArchKind::ARMV6T2) { + if (!arm::isHardTPSupported(EffectiveTriple)) { D.Diag(diag::err_target_unsupported_tp_hard) << EffectiveTriple.getArchName(); return; diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -23,8 +23,6 @@ using namespace clang; using namespace llvm::opt; -using clang::driver::tools::AddLinkerInputs; - void tools::PS4cpu::addProfileRTArgs(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { if ((Args.hasFlag(options::OPT_fprofile_arcs, options::OPT_fno_profile_arcs, diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -76,7 +76,7 @@ ToolChain.AddFilePathLibArgs(Args, CmdArgs); const char *Crt1 = "crt1.o"; - const char *Entry = NULL; + const char *Entry = nullptr; // If crt1-command.o exists, it supports new-style commands, so use it. // Otherwise, use the old crt1.o. This is a temporary transition measure. diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3181,6 +3181,8 @@ ".clang-format file located in one of the parent\n" "directories of the source file (or current\n" "directory for stdin).\n" + "Use -style=file:<format_file_path> to explicitly specify\n" + "the configuration file.\n" "Use -style=\"{key: value, ...}\" to set specific\n" "parameters, e.g.:\n" " -style=\"{BasedOnStyle: llvm, IndentWidth: 8}\""; @@ -3233,6 +3235,18 @@ const char *DefaultFallbackStyle = "LLVM"; +llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> +loadAndParseConfigFile(StringRef ConfigFile, llvm::vfs::FileSystem *FS, + FormatStyle *Style, bool AllowUnknownOptions) { + llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text = + FS->getBufferForFile(ConfigFile.str()); + if (auto EC = Text.getError()) + return EC; + if (auto EC = parseConfiguration(*Text.get(), Style, AllowUnknownOptions)) + return EC; + return Text; +} + llvm::Expected<FormatStyle> getStyle(StringRef StyleName, StringRef FileName, StringRef FallbackStyleName, StringRef Code, llvm::vfs::FileSystem *FS, @@ -3263,6 +3277,28 @@ return Style; } + // User provided clang-format file using -style=file:path/to/format/file.
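For reference, here is the new syntax as seen from the library entry point extended below; the path in this sketch is hypothetical, and the two trailing defaulted parameters of getStyle are omitted:

#include "clang/Format/Format.h"

llvm::Expected<clang::format::FormatStyle>
loadProjectStyle(llvm::StringRef Code) {
  // "file:<path>" loads exactly that configuration file instead of
  // searching the parent directories of FileName.
  return clang::format::getStyle("file:/work/configs/.clang-format",
                                 /*FileName=*/"a.cpp",
                                 /*FallbackStyle=*/"LLVM", Code,
                                 /*FS=*/nullptr);
}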
+ if (!Style.InheritsParentConfig && + StyleName.startswith_insensitive("file:")) { + auto ConfigFile = StyleName.substr(5); + llvm::ErrorOr> Text = + loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions); + if (auto EC = Text.getError()) + return make_string_error("Error reading " + ConfigFile + ": " + + EC.message()); + + LLVM_DEBUG(llvm::dbgs() + << "Using configuration file " << ConfigFile << "\n"); + + if (!Style.InheritsParentConfig) + return Style; + + // Search for parent configs starting from the parent directory of + // ConfigFile. + FileName = ConfigFile; + ChildFormatTextToApply.emplace_back(std::move(*Text)); + } + // If the style inherits the parent configuration it is a command line // configuration, which wants to inherit, so we have to skip the check of the // StyleName. @@ -3308,19 +3344,16 @@ if (Status && (Status->getType() == llvm::sys::fs::file_type::regular_file)) { llvm::ErrorOr> Text = - FS->getBufferForFile(ConfigFile.str()); - if (std::error_code EC = Text.getError()) - return make_string_error(EC.message()); - if (std::error_code ec = - parseConfiguration(*Text.get(), &Style, AllowUnknownOptions)) { - if (ec == ParseError::Unsuitable) { + loadAndParseConfigFile(ConfigFile, FS, &Style, AllowUnknownOptions); + if (auto EC = Text.getError()) { + if (EC == ParseError::Unsuitable) { if (!UnsuitableConfigFiles.empty()) UnsuitableConfigFiles.append(", "); UnsuitableConfigFiles.append(ConfigFile); continue; } return make_string_error("Error reading " + ConfigFile + ": " + - ec.message()); + EC.message()); } LLVM_DEBUG(llvm::dbgs() << "Using configuration file " << ConfigFile << "\n"); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -438,7 +438,7 @@ }(EXTRACTOR(KEYPATH)); \ } -static const StringRef GetInputKindName(InputKind IK); +static StringRef GetInputKindName(InputKind IK); static bool FixupInvocation(CompilerInvocation &Invocation, DiagnosticsEngine &Diags, const ArgList &Args, @@ -3291,7 +3291,7 @@ } /// Get language name for given input kind. -static const StringRef GetInputKindName(InputKind IK) { +static StringRef GetInputKindName(InputKind IK) { switch (IK.getLanguage()) { case Language::C: return "C"; diff --git a/clang/lib/Headers/opencl-c-base.h b/clang/lib/Headers/opencl-c-base.h --- a/clang/lib/Headers/opencl-c-base.h +++ b/clang/lib/Headers/opencl-c-base.h @@ -600,9 +600,11 @@ // C++ for OpenCL - __remove_address_space #if defined(__OPENCL_CPP_VERSION__) template struct __remove_address_space { using type = _Tp; }; +#if defined(__opencl_c_generic_address_space) template struct __remove_address_space<__generic _Tp> { using type = _Tp; }; +#endif template struct __remove_address_space<__global _Tp> { using type = _Tp; }; diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp --- a/clang/lib/Interpreter/IncrementalParser.cpp +++ b/clang/lib/Interpreter/IncrementalParser.cpp @@ -256,7 +256,7 @@ /*LoadedOffset=*/0, NewLoc); // NewLoc only used for diags. - if (PP.EnterSourceFile(FID, /*DirLookup=*/0, NewLoc)) + if (PP.EnterSourceFile(FID, /*DirLookup=*/nullptr, NewLoc)) return llvm::make_error("Parsing failed. 
" "Cannot enter source file.", std::error_code()); diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -2548,9 +2548,9 @@ assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); // Position of the first trigraph in the ending sequence. - const char *TrigraphPos = 0; + const char *TrigraphPos = nullptr; // Position of the first whitespace after a '\' in the ending sequence. - const char *SpacePos = 0; + const char *SpacePos = nullptr; while (true) { // Back up off the newline. diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/UniqueVector.h" +#include "llvm/Frontend/OpenMP/OMPAssume.h" #include "llvm/Frontend/OpenMP/OMPContext.h" using namespace clang; diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1858,6 +1858,15 @@ if (isUnevaluatedContext() || Ty.isNull()) return; + // The original idea behind checkTypeSupport function is that unused + // declarations can be replaced with an array of bytes of the same size during + // codegen, such replacement doesn't seem to be possible for types without + // constant byte size like zero length arrays. So, do a deep check for SYCL. + if (D && LangOpts.SYCLIsDevice) { + llvm::DenseSet Visited; + deepTypeCheckForSYCLDevice(Loc, Visited, D); + } + Decl *C = cast(getCurLexicalContext()); // Memcpy operations for structs containing a member with unsupported type diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -55,7 +55,6 @@ using namespace clang; using namespace sema; -using llvm::RoundingMode; /// Determine whether the use of this declaration is valid, without /// emitting diagnostics. diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -36,6 +36,7 @@ #include "llvm/ADT/PointerEmbeddedInt.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Frontend/OpenMP/OMPAssume.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -14322,8 +14322,7 @@ FoundDecl = MemExpr->getFoundDecl(); Qualifier = MemExpr->getQualifier(); UnbridgedCasts.restore(); - } else { - UnresolvedMemberExpr *UnresExpr = cast(NakedMemExpr); + } else if (auto *UnresExpr = dyn_cast(NakedMemExpr)) { Qualifier = UnresExpr->getQualifier(); QualType ObjectType = UnresExpr->getBaseType(); @@ -14436,7 +14435,9 @@ } MemExpr = cast(MemExprE->IgnoreParens()); - } + } else + // Unimaged NakedMemExpr type. 
+ return ExprError(); QualType ResultType = Method->getReturnType(); ExprValueKind VK = Expr::getValueKindForType(ResultType); diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -48,3 +48,101 @@ return DiagKind != SemaDiagnosticBuilder::K_Immediate && DiagKind != SemaDiagnosticBuilder::K_ImmediateWithCallStack; } + +static bool isZeroSizedArray(Sema &SemaRef, QualType Ty) { + if (const auto *CAT = SemaRef.getASTContext().getAsConstantArrayType(Ty)) + return CAT->getSize() == 0; + return false; +} + +void Sema::deepTypeCheckForSYCLDevice(SourceLocation UsedAt, + llvm::DenseSet<QualType> Visited, + ValueDecl *DeclToCheck) { + assert(getLangOpts().SYCLIsDevice && + "Should only be called during SYCL compilation"); + // Emit notes only for the first discovered declaration of an unsupported + // type to avoid a mess of notes. This flag tracks whether an error has + // already happened. + bool NeedToEmitNotes = true; + + auto Check = [&](QualType TypeToCheck, const ValueDecl *D) { + bool ErrorFound = false; + if (isZeroSizedArray(*this, TypeToCheck)) { + SYCLDiagIfDeviceCode(UsedAt, diag::err_typecheck_zero_array_size) << 1; + ErrorFound = true; + } + // Checks for other types can also be done here. + if (ErrorFound) { + if (NeedToEmitNotes) { + if (auto *FD = dyn_cast<FieldDecl>(D)) + SYCLDiagIfDeviceCode(FD->getLocation(), + diag::note_illegal_field_declared_here) + << FD->getType()->isPointerType() << FD->getType(); + else + SYCLDiagIfDeviceCode(D->getLocation(), diag::note_declared_at); + } + } + + return ErrorFound; + }; + + // In case a record is used, do the DFS for a bad field. + SmallVector StackForRecursion; + StackForRecursion.push_back(DeclToCheck); + + // While doing the DFS, save how we got there to emit a nice set of notes. + SmallVector History; + History.push_back(nullptr); + + do { + const ValueDecl *Next = StackForRecursion.pop_back_val(); + if (!Next) { + assert(!History.empty()); + // Found a marker, we have gone up a level. + History.pop_back(); + continue; + } + QualType NextTy = Next->getType(); + + if (!Visited.insert(NextTy).second) + continue; + + auto EmitHistory = [&]() { + // The first element is always nullptr. + for (uint64_t Index = 1; Index < History.size(); ++Index) { + SYCLDiagIfDeviceCode(History[Index]->getLocation(), + diag::note_within_field_of_type) + << History[Index]->getType(); + } + }; + + if (Check(NextTy, Next)) { + if (NeedToEmitNotes) + EmitHistory(); + NeedToEmitNotes = false; + } + + // If a pointer/array/reference type is met, get the pointee type, then + // proceed with that type. + while (NextTy->isAnyPointerType() || NextTy->isArrayType() || + NextTy->isReferenceType()) { + if (NextTy->isArrayType()) + NextTy = QualType{NextTy->getArrayElementTypeNoTypeQual(), 0}; + else + NextTy = NextTy->getPointeeType(); + if (Check(NextTy, Next)) { + if (NeedToEmitNotes) + EmitHistory(); + NeedToEmitNotes = false; + } + } + + if (const auto *RecDecl = NextTy->getAsRecordDecl()) { + if (auto *NextFD = dyn_cast<FieldDecl>(Next)) + History.push_back(NextFD); + // When nullptr is discovered, this means we've gone back up a level, so + // the history should be cleaned.
+ StackForRecursion.push_back(nullptr); + llvm::copy(RecDecl->fields(), std::back_inserter(StackForRecursion)); + } + } while (!StackForRecursion.empty()); +} diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2515,7 +2515,7 @@ Diag(ArraySize->getBeginLoc(), isSFINAEContext() ? diag::err_typecheck_zero_array_size : diag::ext_typecheck_zero_array_size) - << ArraySize->getSourceRange(); + << 0 << ArraySize->getSourceRange(); } // Is the array too large? diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -142,7 +142,6 @@ using namespace clang::serialization; using namespace clang::serialization::reader; using llvm::BitstreamCursor; -using llvm::RoundingMode; //===----------------------------------------------------------------------===// // ChainedASTReaderListener implementation diff --git a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp --- a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp @@ -23,7 +23,6 @@ using namespace clang; using namespace ento; -using llvm::APSInt; namespace { class MmapWriteExecChecker : public Checker { diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp --- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp +++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp @@ -2804,7 +2804,8 @@ Out << '\'' << Lexer::getSourceText( CharSourceRange::getTokenRange(Ex->getSourceRange()), - BRC.getSourceManager(), BRC.getASTContext().getLangOpts(), 0) + BRC.getSourceManager(), BRC.getASTContext().getLangOpts(), + nullptr) << '\''; } diff --git a/clang/lib/Tooling/Transformer/Parsing.cpp b/clang/lib/Tooling/Transformer/Parsing.cpp --- a/clang/lib/Tooling/Transformer/Parsing.cpp +++ b/clang/lib/Tooling/Transformer/Parsing.cpp @@ -33,7 +33,6 @@ // much as possible with the AST Matchers parsing. 
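The SemaSYCL.cpp function added above walks field types recursively. An illustrative (hypothetical) input it is meant to reject, with the note chain the History vector produces; the diagnostics in the comments are paraphrased, not exact wording:

// SYCL device code (illustrative only):
struct Inner {
  int Zla[0]; // zero-length array: no constant byte size
};
struct Outer {
  Inner I;    // note: within field of type Inner
};
void deviceFunc() {
  Outer O;    // using O triggers deepTypeCheckForSYCLDevice on its type,
  (void)O;    // which DFSes Outer -> Inner -> Zla and reports the array
}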
namespace { -using llvm::Error; using llvm::Expected; template using RangeSelectorOp = RangeSelector (*)(Ts...); diff --git a/clang/test/OpenMP/barrier_codegen.cpp b/clang/test/OpenMP/barrier_codegen.cpp --- a/clang/test/OpenMP/barrier_codegen.cpp +++ b/clang/test/OpenMP/barrier_codegen.cpp @@ -19,8 +19,8 @@ #define HEADER // CHECK: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* } -// CHECK-DAG: [[EXPLICIT_BARRIER_LOC:@.+]] = {{.+}} [[IDENT_T]] { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @{{.+}}, i32 0, i32 0) } -// CHECK-DAG: [[LOC:@.+]] = {{.+}} [[IDENT_T]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @{{.+}}, i32 0, i32 0) } +// CHECK-DAG: [[EXPLICIT_BARRIER_LOC:@.+]] = {{.+}} [[IDENT_T]] { i32 0, i32 34, i32 0, i32 {{[0-9]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @{{.+}}, i32 0, i32 0) } +// CHECK-DAG: [[LOC:@.+]] = {{.+}} [[IDENT_T]] { i32 0, i32 2, i32 0, i32 {{[0-9]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @{{.+}}, i32 0, i32 0) } void foo() {} diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp --- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp +++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp @@ -58,8 +58,8 @@ // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 // CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR4:[0-9]+]] -// CHECK1-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR4]] +// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR6:[0-9]+]] +// CHECK1-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR6]] // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] // CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 // CHECK1-NEXT: ret void @@ -78,9 +78,9 @@ // CHECK1-LABEL: define {{[^@]+}}@_Z3barv // CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: -// CHECK1-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[A:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32* -// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]] +// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR6]] // CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A]], i64 4) // CHECK1-NEXT: ret i32 [[CALL]] // diff --git a/clang/test/OpenMP/for_codegen.cpp b/clang/test/OpenMP/for_codegen.cpp --- a/clang/test/OpenMP/for_codegen.cpp +++ b/clang/test/OpenMP/for_codegen.cpp @@ -22,8 +22,8 @@ // PROF-INSTR-PATH: constant [25 x i8] c"for_codegen-test.profraw\00" // CHECK: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* } -// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8* -// CHECK-DAG: [[LOOP_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 514, i32 0, i32 0, i8* +// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 {{[0-9]+}}, i8* +// CHECK-DAG: [[LOOP_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 514, i32 0, i32 
// CHECK-DAG: [[I:@.+]] ={{.*}} global i8 1,
// CHECK-DAG: [[J:@.+]] ={{.*}} global i8 2,
// CHECK-DAG: [[K:@.+]] ={{.*}} global i8 3,
diff --git a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
--- a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
@@ -11,13 +11,13 @@
int a;
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
-// CHECK-DAG: [[DISTR_LIGHT:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 3, i32 0, i8* getelementptr inbounds
-// CHECK-DAG: [[FOR_LIGHT:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 3, i32 0, i8* getelementptr inbounds
-// CHECK-DAG: [[LIGHT:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 3, i32 0, i8* getelementptr inbounds
-// CHECK-DAG: [[DISTR_FULL:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 1, i32 0, i8* getelementptr inbounds
-// CHECK-DAG: [[FULL:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds
-// CHECK-DAG: [[BAR_LIGHT:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 3, i32 0, i8* getelementptr inbounds
-// CHECK-DAG: [[BAR_FULL:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 1, i32 0, i8* getelementptr inbounds
+// CHECK-DAG: [[DISTR_LIGHT:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 3, i32 {{[0-9]+}}, i8* getelementptr inbounds
+// CHECK-DAG: [[FOR_LIGHT:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 3, i32 {{[0-9]+}}, i8* getelementptr inbounds
+// CHECK-DAG: [[LIGHT:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 3, i32 {{[0-9]+}}, i8* getelementptr inbounds
+// CHECK-DAG: [[DISTR_FULL:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 1, i32 {{[0-9]+}}, i8* getelementptr inbounds
+// CHECK-DAG: [[FULL:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 {{[0-9]+}}, i8* getelementptr inbounds
+// CHECK-DAG: [[BAR_LIGHT:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 3, i32 {{[0-9]+}}, i8* getelementptr inbounds
+// CHECK-DAG: [[BAR_FULL:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 1, i32 {{[0-9]+}}, i8* getelementptr inbounds
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
void foo() {
diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp
--- a/clang/test/OpenMP/nvptx_data_sharing.cpp
+++ b/clang/test/OpenMP/nvptx_data_sharing.cpp
@@ -397,9 +397,9 @@
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
-// CHECK-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK-NEXT: [[A:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
-// CHECK-NEXT: [[B:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK-NEXT: [[B:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK-NEXT: store i32 10, i32* [[A_ON_STACK]], align 4
diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
--- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -3046,7 +3046,7 @@
// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8
-// CHECK4-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 40)
+// CHECK4-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 40)
// CHECK4-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]*
// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4
@@ -3377,7 +3377,7 @@
// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
-// CHECK5-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40)
+// CHECK5-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 40)
// CHECK5-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]*
// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4
@@ -3700,7 +3700,7 @@
// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4
// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4
-// CHECK6-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40)
+// CHECK6-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 40)
// CHECK6-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]*
// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4
diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
--- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -1633,7 +1633,7 @@
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
-// CHECK1-NEXT: [[A1:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[A1:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -1905,7 +1905,7 @@
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
-// CHECK2-NEXT: [[A1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
+// CHECK2-NEXT: [[A1:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp
--- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp
@@ -469,7 +469,7 @@
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
-// CHECK-NEXT: [[D:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK-NEXT: [[D:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D]] to i32*
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4
diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_codegen.cpp
@@ -418,7 +418,7 @@
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK1-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[F:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK1-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK1-NEXT: store double* [[A]], double** [[A_ADDR]], align 8
@@ -802,7 +802,7 @@
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK2-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
+// CHECK2-NEXT: [[F:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK2-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK2-NEXT: store double* [[A]], double** [[A_ADDR]], align 4
@@ -1185,7 +1185,7 @@
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK3-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
+// CHECK3-NEXT: [[F:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK3-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK3-NEXT: store double* [[A]], double** [[A_ADDR]], align 4
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
@@ -66,11 +66,11 @@
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA12:![0-9]+]]
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[ISTART:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32*
-// CHECK1-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[IEND:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32*
-// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
+// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex"*
// CHECK1-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]]
@@ -123,7 +123,7 @@
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast float* [[REF_TMP2]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP15]]) #[[ATTR5]]
// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP2]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM_ON_STACK]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR11:[0-9]+]]
+// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM_ON_STACK]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR10:[0-9]+]]
// CHECK1-NEXT: [[TMP16:%.*]] = bitcast float* [[REF_TMP2]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP16]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP17:%.*]] = bitcast float* [[REF_TMP]] to i8*
@@ -188,7 +188,7 @@
// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load float*, float** [[__RE_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[__IM_ADDR]], align 8
-// CHECK1-NEXT: call void @_ZNSt7complexIfEC2ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[THIS1]], float* nonnull align 4 dereferenceable(4) [[TMP0]], float* nonnull align 4 dereferenceable(4) [[TMP1]]) #[[ATTR11]]
+// CHECK1-NEXT: call void @_ZNSt7complexIfEC2ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[THIS1]], float* nonnull align 4 dereferenceable(4) [[TMP0]], float* nonnull align 4 dereferenceable(4) [[TMP1]]) #[[ATTR10]]
// CHECK1-NEXT: ret void
//
//
@@ -278,7 +278,7 @@
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP23]]) #[[ATTR5]]
// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP6]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR11]]
+// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR10]]
// CHECK1-NEXT: [[TMP24:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP24]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP25:%.*]] = bitcast float* [[REF_TMP]] to i8*
@@ -340,8 +340,8 @@
// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA8]]
// CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to float
// CHECK1-NEXT: store float [[CONV17]], float* [[REF_TMP16]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]], float* nonnull align 4 dereferenceable(4) [[REF_TMP15]], float* nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR11]]
-// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR11]]
+// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]], float* nonnull align 4 dereferenceable(4) [[REF_TMP15]], float* nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR10]]
+// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR10]]
// CHECK1-NEXT: [[TMP45:%.*]] = bitcast float* [[REF_TMP16]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP45]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP46:%.*]] = bitcast float* [[REF_TMP15]] to i8*
@@ -382,7 +382,7 @@
// CHECK1-NEXT: [[TMP61:%.*]] = icmp eq i32 [[TMP60]], 1
// CHECK1-NEXT: br i1 [[TMP61]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
-// CHECK1-NEXT: [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR11]]
+// CHECK1-NEXT: [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR10]]
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP56]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK1: .omp.reduction.done:
@@ -420,13 +420,13 @@
// CHECK1-NEXT: store %"class.std::complex"* [[__C]], %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]]
// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR11]]
+// CHECK1-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR10]]
// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0
// CHECK1-NEXT: [[TMP1:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA16:![0-9]+]]
// CHECK1-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]]
// CHECK1-NEXT: store float [[ADD]], float* [[__RE_]], align 4, !tbaa [[TBAA16]]
// CHECK1-NEXT: [[TMP2:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR11]]
+// CHECK1-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR10]]
// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1
// CHECK1-NEXT: [[TMP3:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA18:![0-9]+]]
// CHECK1-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]]
@@ -461,49 +461,49 @@
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP12]] to i64*
// CHECK1-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 4
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-// CHECK1-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK1-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
-// CHECK1-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 4
-// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
-// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
-// CHECK1-NEXT: [[TMP22:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
-// CHECK1-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK1-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
-// CHECK1-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
-// CHECK1-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
-// CHECK1-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK1-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-// CHECK1-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
-// CHECK1-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
-// CHECK1-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK1-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
+// CHECK1-NEXT: [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
+// CHECK1-NEXT: store i64 [[TMP20]], i64* [[TMP16]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
+// CHECK1-NEXT: [[TMP23:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
+// CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK1-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK1-NEXT: [[TMP29:%.*]] = and i16 [[TMP6]], 1
+// CHECK1-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
+// CHECK1-NEXT: [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
+// CHECK1-NEXT: [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK1-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
+// CHECK1-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
+// CHECK1-NEXT: br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
-// CHECK1-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
-// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK1-NEXT: [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR5]]
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
-// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
-// CHECK1-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
+// CHECK1-NEXT: br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK1: then4:
-// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8
-// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
-// CHECK1-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to %"class.std::complex"*
-// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to %"class.std::complex"*
-// CHECK1-NEXT: [[TMP46:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8*
-// CHECK1-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP44]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP46]], i8* align 4 [[TMP47]], i64 8, i1 false), !tbaa.struct !21
+// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8
+// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8
+// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP42]] to %"class.std::complex"*
+// CHECK1-NEXT: [[TMP46:%.*]] = bitcast i8* [[TMP44]] to %"class.std::complex"*
+// CHECK1-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP46]] to i8*
+// CHECK1-NEXT: [[TMP48:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8*
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 8, i1 false), !tbaa.struct !21
// CHECK1-NEXT: br label [[IFCONT6:%.*]]
// CHECK1: else5:
// CHECK1-NEXT: br label [[IFCONT6]]
@@ -520,53 +520,53 @@
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]]
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
// CHECK1-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
// CHECK1-NEXT: br label [[PRECOND:%.*]]
// CHECK1: precond:
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
-// CHECK1-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
+// CHECK1-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
// CHECK1: body:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
-// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
-// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8, !tbaa [[TBAA12]]
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK1-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
-// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK1: then2:
-// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
-// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
-// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: br label [[IFCONT6:%.*]]
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8, !tbaa [[TBAA12]]
+// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: br label [[IFCONT4:%.*]]
// CHECK1: else3:
-// CHECK1-NEXT: br label [[IFCONT6]]
+// CHECK1-NEXT: br label [[IFCONT4]]
// CHECK1: ifcont4:
-// CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
-// CHECK1-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK1-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
// CHECK1-NEXT: br label [[PRECOND]]
// CHECK1: exit:
// CHECK1-NEXT: ret void
@@ -633,11 +633,11 @@
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA12]]
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[ISTART:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32*
-// CHECK1-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[IEND:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32*
-// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 16)
+// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 16)
// CHECK1-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex.0"*
// CHECK1-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]]
@@ -690,7 +690,7 @@
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast double* [[REF_TMP2]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP15]]) #[[ATTR5]]
// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP2]], align 8, !tbaa [[TBAA22]]
-// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM_ON_STACK]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR11]]
+// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM_ON_STACK]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR10]]
// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[REF_TMP2]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP16]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP17:%.*]] = bitcast double* [[REF_TMP]] to i8*
@@ -755,7 +755,7 @@
// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[__RE_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load double*, double** [[__IM_ADDR]], align 8
-// CHECK1-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR11]]
+// CHECK1-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR10]]
// CHECK1-NEXT: ret void
//
//
@@ -845,7 +845,7 @@
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast double* [[REF_TMP6]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP23]]) #[[ATTR5]]
// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP6]], align 8, !tbaa [[TBAA22]]
-// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR11]]
+// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR10]]
// CHECK1-NEXT: [[TMP24:%.*]] = bitcast double* [[REF_TMP6]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP24]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP25:%.*]] = bitcast double* [[REF_TMP]] to i8*
@@ -907,8 +907,8 @@
// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA8]]
// CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to double
// CHECK1-NEXT: store double [[CONV17]], double* [[REF_TMP16]], align 8, !tbaa [[TBAA22]]
-// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR11]]
-// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR11]]
+// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR10]]
+// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR10]]
// CHECK1-NEXT: [[TMP45:%.*]] = bitcast double* [[REF_TMP16]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP45]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP46:%.*]] = bitcast double* [[REF_TMP15]] to i8*
@@ -949,7 +949,7 @@
// CHECK1-NEXT: [[TMP61:%.*]] = icmp eq i32 [[TMP60]], 1
// CHECK1-NEXT: br i1 [[TMP61]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
-// CHECK1-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR11]]
+// CHECK1-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR10]]
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP56]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK1: .omp.reduction.done:
@@ -987,13 +987,13 @@
// CHECK1-NEXT: store %"class.std::complex.0"* [[__C]], %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]]
// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR11]]
+// CHECK1-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR10]]
// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0
// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA24:![0-9]+]]
// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]]
// CHECK1-NEXT: store double [[ADD]], double* [[__RE_]], align 8, !tbaa [[TBAA24]]
// CHECK1-NEXT: [[TMP2:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR11]]
+// CHECK1-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR10]]
// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1
// CHECK1-NEXT: [[TMP3:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA26:![0-9]+]]
// CHECK1-NEXT: [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]]
@@ -1029,8 +1029,8 @@
// CHECK1-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
// CHECK1-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
// CHECK1: .shuffle.pre_cond:
-// CHECK1-NEXT: [[TMP17:%.*]] = phi i64* [ [[TMP15]], [[ENTRY:%.*]] ], [ [[TMP28:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
-// CHECK1-NEXT: [[TMP18:%.*]] = phi i64* [ [[TMP16]], [[ENTRY]] ], [ [[TMP29:%.*]], [[DOTSHUFFLE_THEN]] ]
+// CHECK1-NEXT: [[TMP17:%.*]] = phi i64* [ [[TMP15]], [[ENTRY:%.*]] ], [ [[TMP29:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
+// CHECK1-NEXT: [[TMP18:%.*]] = phi i64* [ [[TMP16]], [[ENTRY]] ], [ [[TMP30:%.*]], [[DOTSHUFFLE_THEN]] ]
// CHECK1-NEXT: [[TMP19:%.*]] = bitcast i64* [[TMP17]] to i8*
// CHECK1-NEXT: [[TMP20:%.*]] = ptrtoint i8* [[TMP14]] to i64
// CHECK1-NEXT: [[TMP21:%.*]] = ptrtoint i8* [[TMP19]] to i64
@@ -1040,51 +1040,51 @@
// CHECK1-NEXT: br i1 [[TMP24]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
// CHECK1: .shuffle.then:
// CHECK1-NEXT: [[TMP25:%.*]] = load i64, i64* [[TMP17]], align 8
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-// CHECK1-NEXT: [[TMP26:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK1-NEXT: [[TMP27:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP25]], i16 [[TMP7]], i16 [[TMP26]])
-// CHECK1-NEXT: store i64 [[TMP27]], i64* [[TMP18]], align 8
-// CHECK1-NEXT: [[TMP28]] = getelementptr i64, i64* [[TMP17]], i64 1
-// CHECK1-NEXT: [[TMP29]] = getelementptr i64, i64* [[TMP18]], i64 1
+// CHECK1-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK1-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16
+// CHECK1-NEXT: [[TMP28:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP25]], i16 [[TMP7]], i16 [[TMP27]])
+// CHECK1-NEXT: store i64 [[TMP28]], i64* [[TMP18]], align 8
+// CHECK1-NEXT: [[TMP29]] = getelementptr i64, i64* [[TMP17]], i64 1
+// CHECK1-NEXT: [[TMP30]] = getelementptr i64, i64* [[TMP18]], i64 1
// CHECK1-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
// CHECK1: .shuffle.exit:
-// CHECK1-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK1-NEXT: [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP33:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
-// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK1-NEXT: [[TMP36:%.*]] = and i16 [[TMP6]], 1
-// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP36]], 0
-// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP35]], [[TMP37]]
-// CHECK1-NEXT: [[TMP39:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK1-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
-// CHECK1-NEXT: [[TMP41:%.*]] = or i1 [[TMP31]], [[TMP34]]
-// CHECK1-NEXT: [[TMP42:%.*]] = or i1 [[TMP41]], [[TMP40]]
-// CHECK1-NEXT: br i1 [[TMP42]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1-NEXT: [[TMP31:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
+// CHECK1-NEXT: [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK1-NEXT: [[TMP33:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP34:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]]
+// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK1-NEXT: [[TMP37:%.*]] = and i16 [[TMP6]], 1
+// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP37]], 0
+// CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP36]], [[TMP38]]
+// CHECK1-NEXT: [[TMP40:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK1-NEXT: [[TMP41:%.*]] = and i1 [[TMP39]], [[TMP40]]
+// CHECK1-NEXT: [[TMP42:%.*]] = or i1 [[TMP32]], [[TMP35]]
+// CHECK1-NEXT: [[TMP43:%.*]] = or i1 [[TMP42]], [[TMP41]]
+// CHECK1-NEXT: br i1 [[TMP43]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
-// CHECK1-NEXT: [[TMP43:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
-// CHECK1-NEXT: [[TMP44:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP43]], i8* [[TMP44]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP44:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK1-NEXT: [[TMP45:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP44]], i8* [[TMP45]]) #[[ATTR5]]
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
-// CHECK1-NEXT: [[TMP45:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP46:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP47:%.*]] = and i1 [[TMP45]], [[TMP46]]
-// CHECK1-NEXT: br i1 [[TMP47]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK1-NEXT: [[TMP46:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP47:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP48:%.*]] = and i1 [[TMP46]], [[TMP47]]
+// CHECK1-NEXT: br i1 [[TMP48]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK1: then4:
-// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP49:%.*]] = load i8*, i8** [[TMP48]], align 8
-// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP51:%.*]] = load i8*, i8** [[TMP50]], align 8
-// CHECK1-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP49]] to %"class.std::complex.0"*
-// CHECK1-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP51]] to %"class.std::complex.0"*
-// CHECK1-NEXT: [[TMP54:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8*
-// CHECK1-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP52]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP54]], i8* align 8 [[TMP55]], i64 16, i1 false), !tbaa.struct !27
+// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP50:%.*]] = load i8*, i8** [[TMP49]], align 8
+// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8
+// CHECK1-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP50]] to %"class.std::complex.0"*
+// CHECK1-NEXT: [[TMP54:%.*]] = bitcast i8* [[TMP52]] to %"class.std::complex.0"*
+// CHECK1-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP54]] to i8*
+// CHECK1-NEXT: [[TMP56:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8*
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 16, i1 false), !tbaa.struct !27
// CHECK1-NEXT: br label [[IFCONT6:%.*]]
// CHECK1: else5:
// CHECK1-NEXT: br label [[IFCONT6]]
@@ -1101,53 +1101,53 @@
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]]
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
// CHECK1-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
// CHECK1-NEXT: br label [[PRECOND:%.*]]
// CHECK1: precond:
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 4
-// CHECK1-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 4
+// CHECK1-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
// CHECK1: body:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
-// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
-// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8, !tbaa [[TBAA12]]
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK1-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
-// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK1: then2:
-// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
-// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
-// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: br label [[IFCONT6:%.*]]
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8, !tbaa [[TBAA12]]
+// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: br label [[IFCONT4:%.*]]
// CHECK1: else3:
-// CHECK1-NEXT: br label [[IFCONT6]]
+// CHECK1-NEXT: br label [[IFCONT4]]
// CHECK1: ifcont4:
-// CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
-// CHECK1-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
+// CHECK1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK1-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
// CHECK1-NEXT: br label [[PRECOND]]
// CHECK1: exit:
// CHECK1-NEXT: ret void
@@ -1300,11 +1300,11 @@
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA12:![0-9]+]]
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA12]]
-// CHECK2-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK2-NEXT: [[ISTART:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK2-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32*
-// CHECK2-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK2-NEXT: [[IEND:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK2-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32*
-// CHECK2-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
+// CHECK2-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 8)
// CHECK2-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex"*
// CHECK2-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]]
@@ -1357,7 +1357,7 @@
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast float* [[REF_TMP2]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP15]]) #[[ATTR5]]
// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP2]], align 4, !tbaa [[TBAA14]]
-// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM_ON_STACK]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR11:[0-9]+]]
+// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM_ON_STACK]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR10:[0-9]+]]
// CHECK2-NEXT: [[TMP16:%.*]] = bitcast float* [[REF_TMP2]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP16]]) #[[ATTR5]]
// CHECK2-NEXT: [[TMP17:%.*]] = bitcast float* [[REF_TMP]] to i8*
@@ -1422,7 +1422,7 @@
// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8
// CHECK2-NEXT: [[TMP0:%.*]] = load float*, float** [[__RE_ADDR]], align 8
// CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[__IM_ADDR]], align 8
-// CHECK2-NEXT: call void @_ZNSt7complexIfEC2ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[THIS1]], float* nonnull align 4 dereferenceable(4) [[TMP0]], float* nonnull align 4 dereferenceable(4) [[TMP1]]) #[[ATTR11]]
+// CHECK2-NEXT: call void @_ZNSt7complexIfEC2ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[THIS1]], float* nonnull align 4 dereferenceable(4) [[TMP0]], float* nonnull align 4 dereferenceable(4) [[TMP1]]) #[[ATTR10]]
// CHECK2-NEXT: ret void
//
//
@@ -1512,7 +1512,7 @@
// CHECK2-NEXT: [[TMP23:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP23]]) #[[ATTR5]]
// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP6]], align 4, !tbaa [[TBAA14]]
-// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR11]]
+// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR10]]
// CHECK2-NEXT: [[TMP24:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP24]]) #[[ATTR5]]
// CHECK2-NEXT: [[TMP25:%.*]] = bitcast float* [[REF_TMP]] to i8*
@@ -1574,8 +1574,8 @@
// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA8]]
// CHECK2-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to float
// CHECK2-NEXT: store float [[CONV17]], float* [[REF_TMP16]], align 4, !tbaa [[TBAA14]]
-// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]], float* nonnull align 4 dereferenceable(4) [[REF_TMP15]], float* nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR11]]
-// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR11]]
+// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]], float* nonnull align 4 dereferenceable(4) [[REF_TMP15]], float* nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR10]]
+// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR10]]
// CHECK2-NEXT: [[TMP45:%.*]] = bitcast float* [[REF_TMP16]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP45]]) #[[ATTR5]]
// CHECK2-NEXT: [[TMP46:%.*]] = bitcast float* [[REF_TMP15]] to i8*
@@ -1616,7 +1616,7 @@
// CHECK2-NEXT: [[TMP61:%.*]] = icmp eq i32 [[TMP60]], 1
// CHECK2-NEXT: br i1 [[TMP61]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
-// CHECK2-NEXT: [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR11]]
+// CHECK2-NEXT: [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR10]]
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP56]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK2: .omp.reduction.done:
@@ -1654,13 +1654,13 @@
// CHECK2-NEXT: store %"class.std::complex"* [[__C]], %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]]
// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8
// CHECK2-NEXT: [[TMP0:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]]
-// CHECK2-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR11]]
+// CHECK2-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR10]]
// CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0
// CHECK2-NEXT: [[TMP1:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA16:![0-9]+]]
// CHECK2-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]]
// CHECK2-NEXT: store float [[ADD]], float* [[__RE_]], align 4, !tbaa [[TBAA16]]
// CHECK2-NEXT: [[TMP2:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]]
-// CHECK2-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR11]]
+// CHECK2-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR10]]
// CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1
// CHECK2-NEXT: [[TMP3:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA18:![0-9]+]]
// CHECK2-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]]
@@ -1695,49 +1695,49 @@
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP12]] to i64*
// CHECK2-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 4
-// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-// CHECK2-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK2-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
-// CHECK2-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 4
-// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
-// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
-// CHECK2-NEXT: [[TMP22:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
-// CHECK2-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK2-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK2-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
-// CHECK2-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK2-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
-// CHECK2-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
-// CHECK2-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
-// CHECK2-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK2-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-// CHECK2-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
-// CHECK2-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
-// CHECK2-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK2-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
+// CHECK2-NEXT: [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
+// CHECK2-NEXT: store i64 [[TMP20]], i64* [[TMP16]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
+// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
+// CHECK2-NEXT: [[TMP23:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]]
+// CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK2-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK2-NEXT: [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK2-NEXT: [[TMP29:%.*]] = and i16 [[TMP6]], 1
+// CHECK2-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
+// CHECK2-NEXT: [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
+// CHECK2-NEXT: [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK2-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
+// CHECK2-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
+// CHECK2-NEXT: br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK2: then:
-// CHECK2-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
-// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR5]]
+// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK2-NEXT: [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR5]]
// CHECK2-NEXT: br label [[IFCONT:%.*]]
// CHECK2: else:
// CHECK2-NEXT: br label [[IFCONT]]
// CHECK2: ifcont:
-// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK2-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK2-NEXT:
[[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK2-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]] +// CHECK2-NEXT: br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]] // CHECK2: then4: -// CHECK2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 -// CHECK2-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 -// CHECK2-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to %"class.std::complex"* -// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to %"class.std::complex"* -// CHECK2-NEXT: [[TMP46:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8* -// CHECK2-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP44]] to i8* -// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP46]], i8* align 4 [[TMP47]], i64 8, i1 false), !tbaa.struct !21 +// CHECK2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8 +// CHECK2-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8 +// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP42]] to %"class.std::complex"* +// CHECK2-NEXT: [[TMP46:%.*]] = bitcast i8* [[TMP44]] to %"class.std::complex"* +// CHECK2-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP46]] to i8* +// CHECK2-NEXT: [[TMP48:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 8, i1 false), !tbaa.struct !21 // CHECK2-NEXT: br label [[IFCONT6:%.*]] // CHECK2: else5: // CHECK2-NEXT: br label [[IFCONT6]] @@ -1754,53 +1754,53 @@ // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]] // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31 +// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]* // CHECK2-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] // 
CHECK2-NEXT: br label [[PRECOND:%.*]] // CHECK2: precond: -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK2-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK2-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2 +// CHECK2-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK2: body: // CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK2: then: -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8, !tbaa [[TBAA12]] +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]] +// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK2-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4 // CHECK2-NEXT: br label [[IFCONT:%.*]] // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] // CHECK2: ifcont: // CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] -// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] // CHECK2: then2: -// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]] -// CHECK2-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]] -// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2-NEXT: [[TMP17:%.*]] = 
getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8, !tbaa [[TBAA12]] +// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32* +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]] +// CHECK2-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: br label [[IFCONT4:%.*]] // CHECK2: else3: -// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2-NEXT: br label [[IFCONT4]] // CHECK2: ifcont4: -// CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK2-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK2-NEXT: br label [[PRECOND]] // CHECK2: exit: // CHECK2-NEXT: ret void @@ -1867,11 +1867,11 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA12]] // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK2-NEXT: [[ISTART:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4) // CHECK2-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32* -// CHECK2-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK2-NEXT: [[IEND:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4) // CHECK2-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32* -// CHECK2-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 16) +// CHECK2-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 16) // CHECK2-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex.0"* // CHECK2-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]] @@ -1924,7 +1924,7 @@ // CHECK2-NEXT: [[TMP15:%.*]] = bitcast double* [[REF_TMP2]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP15]]) #[[ATTR5]] // CHECK2-NEXT: store double 0.000000e+00, double* [[REF_TMP2]], align 8, !tbaa [[TBAA22]] -// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM_ON_STACK]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR11]] +// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM_ON_STACK]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR10]] // CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[REF_TMP2]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP16]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP17:%.*]] = bitcast double* [[REF_TMP]] to i8* @@ -1989,7 +1989,7 @@ // CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 // CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[__RE_ADDR]], align 8 // 
CHECK2-NEXT: [[TMP1:%.*]] = load double*, double** [[__IM_ADDR]], align 8 -// CHECK2-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR11]] +// CHECK2-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR10]] // CHECK2-NEXT: ret void // // @@ -2079,7 +2079,7 @@ // CHECK2-NEXT: [[TMP23:%.*]] = bitcast double* [[REF_TMP6]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP23]]) #[[ATTR5]] // CHECK2-NEXT: store double 0.000000e+00, double* [[REF_TMP6]], align 8, !tbaa [[TBAA22]] -// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR11]] +// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR10]] // CHECK2-NEXT: [[TMP24:%.*]] = bitcast double* [[REF_TMP6]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP24]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP25:%.*]] = bitcast double* [[REF_TMP]] to i8* @@ -2141,8 +2141,8 @@ // CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA8]] // CHECK2-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to double // CHECK2-NEXT: store double [[CONV17]], double* [[REF_TMP16]], align 8, !tbaa [[TBAA22]] -// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR11]] -// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR11]] +// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR10]] +// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR10]] // CHECK2-NEXT: [[TMP45:%.*]] = bitcast double* [[REF_TMP16]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP45]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP46:%.*]] = bitcast double* [[REF_TMP15]] to i8* @@ -2183,7 +2183,7 @@ // CHECK2-NEXT: [[TMP61:%.*]] = icmp eq i32 [[TMP60]], 1 // CHECK2-NEXT: br i1 [[TMP61]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* 
@_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR11]] +// CHECK2-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR10]] // CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP56]]) // CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // CHECK2: .omp.reduction.done: @@ -2221,13 +2221,13 @@ // CHECK2-NEXT: store %"class.std::complex.0"* [[__C]], %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]] // CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 // CHECK2-NEXT: [[TMP0:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR11]] +// CHECK2-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR10]] // CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0 // CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA24:![0-9]+]] // CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]] // CHECK2-NEXT: store double [[ADD]], double* [[__RE_]], align 8, !tbaa [[TBAA24]] // CHECK2-NEXT: [[TMP2:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR11]] +// CHECK2-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR10]] // CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1 // CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA26:![0-9]+]] // CHECK2-NEXT: [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]] @@ -2263,8 +2263,8 @@ // CHECK2-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i64* // CHECK2-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]] // CHECK2: .shuffle.pre_cond: -// CHECK2-NEXT: [[TMP17:%.*]] = phi i64* [ [[TMP15]], [[ENTRY:%.*]] ], [ [[TMP28:%.*]], [[DOTSHUFFLE_THEN:%.*]] ] -// CHECK2-NEXT: [[TMP18:%.*]] = phi i64* [ [[TMP16]], [[ENTRY]] ], [ [[TMP29:%.*]], [[DOTSHUFFLE_THEN]] ] +// CHECK2-NEXT: [[TMP17:%.*]] = phi i64* [ [[TMP15]], [[ENTRY:%.*]] ], [ [[TMP29:%.*]], [[DOTSHUFFLE_THEN:%.*]] ] +// CHECK2-NEXT: [[TMP18:%.*]] = phi i64* [ [[TMP16]], [[ENTRY]] ], [ [[TMP30:%.*]], [[DOTSHUFFLE_THEN]] ] // CHECK2-NEXT: [[TMP19:%.*]] = bitcast i64* [[TMP17]] to i8* // CHECK2-NEXT: [[TMP20:%.*]] = ptrtoint i8* [[TMP14]] to i64 // CHECK2-NEXT: [[TMP21:%.*]] = ptrtoint i8* [[TMP19]] to i64 @@ -2274,51 +2274,51 @@ // CHECK2-NEXT: br i1 [[TMP24]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]] // CHECK2: .shuffle.then: // CHECK2-NEXT: [[TMP25:%.*]] = load i64, i64* [[TMP17]], align 8 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 
@__kmpc_get_warp_size() -// CHECK2-NEXT: [[TMP26:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK2-NEXT: [[TMP27:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP25]], i16 [[TMP7]], i16 [[TMP26]]) -// CHECK2-NEXT: store i64 [[TMP27]], i64* [[TMP18]], align 8 -// CHECK2-NEXT: [[TMP28]] = getelementptr i64, i64* [[TMP17]], i64 1 -// CHECK2-NEXT: [[TMP29]] = getelementptr i64, i64* [[TMP18]], i64 1 +// CHECK2-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_get_warp_size() +// CHECK2-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16 +// CHECK2-NEXT: [[TMP28:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP25]], i16 [[TMP7]], i16 [[TMP27]]) +// CHECK2-NEXT: store i64 [[TMP28]], i64* [[TMP18]], align 8 +// CHECK2-NEXT: [[TMP29]] = getelementptr i64, i64* [[TMP17]], i64 1 +// CHECK2-NEXT: [[TMP30]] = getelementptr i64, i64* [[TMP18]], i64 1 // CHECK2-NEXT: br label [[DOTSHUFFLE_PRE_COND]] // CHECK2: .shuffle.exit: -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK2-NEXT: [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP33:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]] -// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK2-NEXT: [[TMP36:%.*]] = and i16 [[TMP6]], 1 -// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP36]], 0 -// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP35]], [[TMP37]] -// CHECK2-NEXT: [[TMP39:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK2-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]] -// CHECK2-NEXT: [[TMP41:%.*]] = or i1 [[TMP31]], [[TMP34]] -// CHECK2-NEXT: [[TMP42:%.*]] = or i1 [[TMP41]], [[TMP40]] -// CHECK2-NEXT: br i1 [[TMP42]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2-NEXT: [[TMP31:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK2-NEXT: store i8* [[TMP31]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]] +// CHECK2-NEXT: [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK2-NEXT: [[TMP33:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP34:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]] +// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK2-NEXT: [[TMP37:%.*]] = and i16 [[TMP6]], 1 +// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP37]], 0 +// CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP36]], [[TMP38]] +// CHECK2-NEXT: [[TMP40:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK2-NEXT: [[TMP41:%.*]] = and i1 [[TMP39]], [[TMP40]] +// CHECK2-NEXT: [[TMP42:%.*]] = or i1 [[TMP32]], [[TMP35]] +// CHECK2-NEXT: [[TMP43:%.*]] = or i1 [[TMP42]], [[TMP41]] +// CHECK2-NEXT: br i1 [[TMP43]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK2: then: -// CHECK2-NEXT: [[TMP43:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK2-NEXT: [[TMP44:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP43]], i8* [[TMP44]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP44:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK2-NEXT: [[TMP45:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP44]], i8* [[TMP45]]) #[[ATTR5]] // CHECK2-NEXT: br label [[IFCONT:%.*]] // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] // CHECK2: ifcont: -// CHECK2-NEXT: [[TMP45:%.*]] = icmp eq i16 
[[TMP8]], 1 -// CHECK2-NEXT: [[TMP46:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP47:%.*]] = and i1 [[TMP45]], [[TMP46]] -// CHECK2-NEXT: br i1 [[TMP47]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2-NEXT: [[TMP46:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP47:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP48:%.*]] = and i1 [[TMP46]], [[TMP47]] +// CHECK2-NEXT: br i1 [[TMP48]], label [[THEN4:%.*]], label [[ELSE5:%.*]] // CHECK2: then4: -// CHECK2-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP49:%.*]] = load i8*, i8** [[TMP48]], align 8 -// CHECK2-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP51:%.*]] = load i8*, i8** [[TMP50]], align 8 -// CHECK2-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP49]] to %"class.std::complex.0"* -// CHECK2-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP51]] to %"class.std::complex.0"* -// CHECK2-NEXT: [[TMP54:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8* -// CHECK2-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP52]] to i8* -// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP54]], i8* align 8 [[TMP55]], i64 16, i1 false), !tbaa.struct !27 +// CHECK2-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP50:%.*]] = load i8*, i8** [[TMP49]], align 8 +// CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8 +// CHECK2-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP50]] to %"class.std::complex.0"* +// CHECK2-NEXT: [[TMP54:%.*]] = bitcast i8* [[TMP52]] to %"class.std::complex.0"* +// CHECK2-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP54]] to i8* +// CHECK2-NEXT: [[TMP56:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 16, i1 false), !tbaa.struct !27 // CHECK2-NEXT: br label [[IFCONT6:%.*]] // CHECK2: else5: // CHECK2-NEXT: br label [[IFCONT6]] @@ -2335,53 +2335,53 @@ // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]] // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31 +// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast 
i8* [[TMP6]] to [1 x i8*]* // CHECK2-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK2-NEXT: br label [[PRECOND:%.*]] // CHECK2: precond: -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK2-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 4 -// CHECK2-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 4 +// CHECK2-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK2: body: // CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK2: then: -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8, !tbaa [[TBAA12]] +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]] +// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK2-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4 // CHECK2-NEXT: br label [[IFCONT:%.*]] // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] // CHECK2: ifcont: // CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] -// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] // CHECK2: then2: -// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]] -// CHECK2-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, 
!tbaa [[TBAA8]] -// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8, !tbaa [[TBAA12]] +// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32* +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]] +// CHECK2-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: br label [[IFCONT4:%.*]] // CHECK2: else3: -// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2-NEXT: br label [[IFCONT4]] // CHECK2: ifcont4: -// CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK2-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK2-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK2-NEXT: br label [[PRECOND]] // CHECK2: exit: // CHECK2-NEXT: ret void @@ -2534,11 +2534,11 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA12:![0-9]+]] // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK3-NEXT: [[ISTART:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4) // CHECK3-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32* -// CHECK3-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK3-NEXT: [[IEND:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4) // CHECK3-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32* -// CHECK3-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK3-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 8) // CHECK3-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex"* // CHECK3-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]] @@ -2591,7 +2591,7 @@ // CHECK3-NEXT: [[TMP15:%.*]] = bitcast float* [[REF_TMP2]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP15]]) #[[ATTR5]] // CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP2]], align 4, !tbaa [[TBAA14]] -// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM_ON_STACK]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR11:[0-9]+]] +// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM_ON_STACK]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR10:[0-9]+]] // CHECK3-NEXT: [[TMP16:%.*]] = bitcast float* [[REF_TMP2]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP16]]) #[[ATTR5]] // CHECK3-NEXT: [[TMP17:%.*]] = bitcast float* [[REF_TMP]] to i8* @@ -2656,7 +2656,7 @@ // CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], 
align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load float*, float** [[__RE_ADDR]], align 8 // CHECK3-NEXT: [[TMP1:%.*]] = load float*, float** [[__IM_ADDR]], align 8 -// CHECK3-NEXT: call void @_ZNSt7complexIfEC2ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[THIS1]], float* nonnull align 4 dereferenceable(4) [[TMP0]], float* nonnull align 4 dereferenceable(4) [[TMP1]]) #[[ATTR11]] +// CHECK3-NEXT: call void @_ZNSt7complexIfEC2ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[THIS1]], float* nonnull align 4 dereferenceable(4) [[TMP0]], float* nonnull align 4 dereferenceable(4) [[TMP1]]) #[[ATTR10]] // CHECK3-NEXT: ret void // // @@ -2746,7 +2746,7 @@ // CHECK3-NEXT: [[TMP23:%.*]] = bitcast float* [[REF_TMP6]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP23]]) #[[ATTR5]] // CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP6]], align 4, !tbaa [[TBAA14]] -// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR11]] +// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR10]] // CHECK3-NEXT: [[TMP24:%.*]] = bitcast float* [[REF_TMP6]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP24]]) #[[ATTR5]] // CHECK3-NEXT: [[TMP25:%.*]] = bitcast float* [[REF_TMP]] to i8* @@ -2808,8 +2808,8 @@ // CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA8]] // CHECK3-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to float // CHECK3-NEXT: store float [[CONV17]], float* [[REF_TMP16]], align 4, !tbaa [[TBAA14]] -// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]], float* nonnull align 4 dereferenceable(4) [[REF_TMP15]], float* nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR11]] -// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR11]] +// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]], float* nonnull align 4 dereferenceable(4) [[REF_TMP15]], float* nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR10]] +// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR10]] // CHECK3-NEXT: [[TMP45:%.*]] = bitcast float* [[REF_TMP16]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP45]]) #[[ATTR5]] // CHECK3-NEXT: [[TMP46:%.*]] = bitcast float* [[REF_TMP15]] to i8* @@ -2850,7 +2850,7 @@ // CHECK3-NEXT: [[TMP61:%.*]] = icmp eq i32 [[TMP60]], 1 // CHECK3-NEXT: br i1 [[TMP61]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* 
@_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR11]] +// CHECK3-NEXT: [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR10]] // CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP56]]) // CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // CHECK3: .omp.reduction.done: @@ -2888,13 +2888,13 @@ // CHECK3-NEXT: store %"class.std::complex"* [[__C]], %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]] // CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR11]] +// CHECK3-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR10]] // CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA16:![0-9]+]] // CHECK3-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]] // CHECK3-NEXT: store float [[ADD]], float* [[__RE_]], align 4, !tbaa [[TBAA16]] // CHECK3-NEXT: [[TMP2:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR11]] +// CHECK3-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR10]] // CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1 // CHECK3-NEXT: [[TMP3:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA18:![0-9]+]] // CHECK3-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]] @@ -2929,49 +2929,49 @@ // CHECK3-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP12]] to i64* // CHECK3-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i64* // CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 4 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -// CHECK3-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK3-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) -// CHECK3-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 -// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 -// CHECK3-NEXT: [[TMP22:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK3-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] -// CHECK3-NEXT: 
[[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK3-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 -// CHECK3-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 -// CHECK3-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] -// CHECK3-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK3-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK3-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] -// CHECK3-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] -// CHECK3-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size() +// CHECK3-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 +// CHECK3-NEXT: [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]]) +// CHECK3-NEXT: store i64 [[TMP20]], i64* [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 +// CHECK3-NEXT: [[TMP23:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]] +// CHECK3-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]] +// CHECK3-NEXT: [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP29:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0 +// CHECK3-NEXT: [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]] +// CHECK3-NEXT: [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]] +// CHECK3-NEXT: [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]] +// CHECK3-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]] +// CHECK3-NEXT: br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK3: then: -// CHECK3-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR5]] // CHECK3-NEXT: br label [[IFCONT:%.*]] // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] // CHECK3: ifcont: -// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK3-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]] +// CHECK3-NEXT: br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]] // CHECK3: then4: -// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 -// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 -// CHECK3-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to 
%"class.std::complex"* -// CHECK3-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to %"class.std::complex"* -// CHECK3-NEXT: [[TMP46:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8* -// CHECK3-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP44]] to i8* -// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP46]], i8* align 4 [[TMP47]], i64 8, i1 false), !tbaa.struct !21 +// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8 +// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8 +// CHECK3-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP42]] to %"class.std::complex"* +// CHECK3-NEXT: [[TMP46:%.*]] = bitcast i8* [[TMP44]] to %"class.std::complex"* +// CHECK3-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP46]] to i8* +// CHECK3-NEXT: [[TMP48:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 8, i1 false), !tbaa.struct !21 // CHECK3-NEXT: br label [[IFCONT6:%.*]] // CHECK3: else5: // CHECK3-NEXT: br label [[IFCONT6]] @@ -2988,53 +2988,53 @@ // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]] // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31 +// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]* // CHECK3-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK3-NEXT: br label [[PRECOND:%.*]] // CHECK3: precond: -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK3-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2 +// CHECK3-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK3: body: // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK3: then: -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x 
i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8, !tbaa [[TBAA12]] +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]] +// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4 // CHECK3-NEXT: br label [[IFCONT:%.*]] // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] // CHECK3: ifcont: // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] // CHECK3: then2: -// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8, !tbaa [[TBAA12]] +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32* +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]] +// CHECK3-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: br label [[IFCONT4:%.*]] // CHECK3: else3: -// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3-NEXT: br label [[IFCONT4]] // CHECK3: ifcont4: -// CHECK3-NEXT: [[TMP20:%.*]] = add 
nsw i32 [[TMP5]], 1 -// CHECK3-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK3-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK3-NEXT: br label [[PRECOND]] // CHECK3: exit: // CHECK3-NEXT: ret void @@ -3101,11 +3101,11 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA12]] // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK3-NEXT: [[ISTART:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4) // CHECK3-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32* -// CHECK3-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK3-NEXT: [[IEND:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4) // CHECK3-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32* -// CHECK3-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 16) +// CHECK3-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 16) // CHECK3-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex.0"* // CHECK3-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]] @@ -3158,7 +3158,7 @@ // CHECK3-NEXT: [[TMP15:%.*]] = bitcast double* [[REF_TMP2]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP15]]) #[[ATTR5]] // CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP2]], align 8, !tbaa [[TBAA22]] -// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM_ON_STACK]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR11]] +// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM_ON_STACK]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR10]] // CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[REF_TMP2]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP16]]) #[[ATTR5]] // CHECK3-NEXT: [[TMP17:%.*]] = bitcast double* [[REF_TMP]] to i8* @@ -3223,7 +3223,7 @@ // CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[__RE_ADDR]], align 8 // CHECK3-NEXT: [[TMP1:%.*]] = load double*, double** [[__IM_ADDR]], align 8 -// CHECK3-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR11]] +// CHECK3-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR10]] // CHECK3-NEXT: ret void // // @@ -3313,7 +3313,7 @@ // CHECK3-NEXT: [[TMP23:%.*]] = bitcast double* [[REF_TMP6]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP23]]) #[[ATTR5]] // CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP6]], align 8, 
!tbaa [[TBAA22]] -// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR11]] +// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR10]] // CHECK3-NEXT: [[TMP24:%.*]] = bitcast double* [[REF_TMP6]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP24]]) #[[ATTR5]] // CHECK3-NEXT: [[TMP25:%.*]] = bitcast double* [[REF_TMP]] to i8* @@ -3375,8 +3375,8 @@ // CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA8]] // CHECK3-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to double // CHECK3-NEXT: store double [[CONV17]], double* [[REF_TMP16]], align 8, !tbaa [[TBAA22]] -// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR11]] -// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR11]] +// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR10]] +// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR10]] // CHECK3-NEXT: [[TMP45:%.*]] = bitcast double* [[REF_TMP16]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP45]]) #[[ATTR5]] // CHECK3-NEXT: [[TMP46:%.*]] = bitcast double* [[REF_TMP15]] to i8* @@ -3417,7 +3417,7 @@ // CHECK3-NEXT: [[TMP61:%.*]] = icmp eq i32 [[TMP60]], 1 // CHECK3-NEXT: br i1 [[TMP61]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR11]] +// CHECK3-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR10]] // CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP56]]) // CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // CHECK3: .omp.reduction.done: @@ -3455,13 +3455,13 @@ // CHECK3-NEXT: store %"class.std::complex.0"* [[__C]], %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]] // CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** 
[[THIS_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR11]] +// CHECK3-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR10]] // CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA24:![0-9]+]] // CHECK3-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]] // CHECK3-NEXT: store double [[ADD]], double* [[__RE_]], align 8, !tbaa [[TBAA24]] // CHECK3-NEXT: [[TMP2:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR11]] +// CHECK3-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR10]] // CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1 // CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA26:![0-9]+]] // CHECK3-NEXT: [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]] @@ -3497,8 +3497,8 @@ // CHECK3-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i64* // CHECK3-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]] // CHECK3: .shuffle.pre_cond: -// CHECK3-NEXT: [[TMP17:%.*]] = phi i64* [ [[TMP15]], [[ENTRY:%.*]] ], [ [[TMP28:%.*]], [[DOTSHUFFLE_THEN:%.*]] ] -// CHECK3-NEXT: [[TMP18:%.*]] = phi i64* [ [[TMP16]], [[ENTRY]] ], [ [[TMP29:%.*]], [[DOTSHUFFLE_THEN]] ] +// CHECK3-NEXT: [[TMP17:%.*]] = phi i64* [ [[TMP15]], [[ENTRY:%.*]] ], [ [[TMP29:%.*]], [[DOTSHUFFLE_THEN:%.*]] ] +// CHECK3-NEXT: [[TMP18:%.*]] = phi i64* [ [[TMP16]], [[ENTRY]] ], [ [[TMP30:%.*]], [[DOTSHUFFLE_THEN]] ] // CHECK3-NEXT: [[TMP19:%.*]] = bitcast i64* [[TMP17]] to i8* // CHECK3-NEXT: [[TMP20:%.*]] = ptrtoint i8* [[TMP14]] to i64 // CHECK3-NEXT: [[TMP21:%.*]] = ptrtoint i8* [[TMP19]] to i64 @@ -3508,51 +3508,51 @@ // CHECK3-NEXT: br i1 [[TMP24]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]] // CHECK3: .shuffle.then: // CHECK3-NEXT: [[TMP25:%.*]] = load i64, i64* [[TMP17]], align 8 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -// CHECK3-NEXT: [[TMP26:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK3-NEXT: [[TMP27:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP25]], i16 [[TMP7]], i16 [[TMP26]]) -// CHECK3-NEXT: store i64 [[TMP27]], i64* [[TMP18]], align 8 -// CHECK3-NEXT: [[TMP28]] = getelementptr i64, i64* [[TMP17]], i64 1 -// CHECK3-NEXT: [[TMP29]] = getelementptr i64, i64* [[TMP18]], i64 1 +// CHECK3-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_get_warp_size() +// CHECK3-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16 +// CHECK3-NEXT: [[TMP28:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP25]], i16 [[TMP7]], i16 [[TMP27]]) +// CHECK3-NEXT: store i64 [[TMP28]], i64* [[TMP18]], align 8 +// CHECK3-NEXT: [[TMP29]] = getelementptr i64, i64* [[TMP17]], i64 1 +// CHECK3-NEXT: [[TMP30]] = getelementptr i64, i64* [[TMP18]], i64 1 // CHECK3-NEXT: br label [[DOTSHUFFLE_PRE_COND]] // CHECK3: 
.shuffle.exit: -// CHECK3-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK3-NEXT: [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP33:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]] -// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK3-NEXT: [[TMP36:%.*]] = and i16 [[TMP6]], 1 -// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP36]], 0 -// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP35]], [[TMP37]] -// CHECK3-NEXT: [[TMP39:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK3-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]] -// CHECK3-NEXT: [[TMP41:%.*]] = or i1 [[TMP31]], [[TMP34]] -// CHECK3-NEXT: [[TMP42:%.*]] = or i1 [[TMP41]], [[TMP40]] -// CHECK3-NEXT: br i1 [[TMP42]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3-NEXT: [[TMP31:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP31]], i8** [[TMP11]], align 8, !tbaa [[TBAA12]] +// CHECK3-NEXT: [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP33:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP34:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP37:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP37]], 0 +// CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP36]], [[TMP38]] +// CHECK3-NEXT: [[TMP40:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP41:%.*]] = and i1 [[TMP39]], [[TMP40]] +// CHECK3-NEXT: [[TMP42:%.*]] = or i1 [[TMP32]], [[TMP35]] +// CHECK3-NEXT: [[TMP43:%.*]] = or i1 [[TMP42]], [[TMP41]] +// CHECK3-NEXT: br i1 [[TMP43]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK3: then: -// CHECK3-NEXT: [[TMP43:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK3-NEXT: [[TMP44:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP43]], i8* [[TMP44]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP44:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP45:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP44]], i8* [[TMP45]]) #[[ATTR5]] // CHECK3-NEXT: br label [[IFCONT:%.*]] // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] // CHECK3: ifcont: -// CHECK3-NEXT: [[TMP45:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP46:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP47:%.*]] = and i1 [[TMP45]], [[TMP46]] -// CHECK3-NEXT: br i1 [[TMP47]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3-NEXT: [[TMP46:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP47:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP48:%.*]] = and i1 [[TMP46]], [[TMP47]] +// CHECK3-NEXT: br i1 [[TMP48]], label [[THEN4:%.*]], label [[ELSE5:%.*]] // CHECK3: then4: -// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP49:%.*]] = load i8*, i8** [[TMP48]], align 8 -// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP51:%.*]] = load i8*, i8** [[TMP50]], align 8 -// CHECK3-NEXT: [[TMP52:%.*]] = bitcast i8* 
[[TMP49]] to %"class.std::complex.0"* -// CHECK3-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP51]] to %"class.std::complex.0"* -// CHECK3-NEXT: [[TMP54:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8* -// CHECK3-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP52]] to i8* -// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP54]], i8* align 8 [[TMP55]], i64 16, i1 false), !tbaa.struct !27 +// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP50:%.*]] = load i8*, i8** [[TMP49]], align 8 +// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8 +// CHECK3-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP50]] to %"class.std::complex.0"* +// CHECK3-NEXT: [[TMP54:%.*]] = bitcast i8* [[TMP52]] to %"class.std::complex.0"* +// CHECK3-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP54]] to i8* +// CHECK3-NEXT: [[TMP56:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8* +// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 16, i1 false), !tbaa.struct !27 // CHECK3-NEXT: br label [[IFCONT6:%.*]] // CHECK3: else5: // CHECK3-NEXT: br label [[IFCONT6]] @@ -3569,53 +3569,53 @@ // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]] // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31 +// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]* // CHECK3-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK3-NEXT: br label [[PRECOND:%.*]] // CHECK3: precond: -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 4 -// CHECK3-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 4 +// CHECK3-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]] // CHECK3: body: // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // CHECK3: then: -// CHECK3-NEXT: [[TMP7:%.*]] = 
getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8, !tbaa [[TBAA12]] +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]] +// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4 // CHECK3-NEXT: br label [[IFCONT:%.*]] // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] // CHECK3: ifcont: // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] // CHECK3: then2: -// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]] -// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]] -// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8, !tbaa [[TBAA12]] +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32* +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]] +// CHECK3-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: br label [[IFCONT4:%.*]] // CHECK3: else3: -// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3-NEXT: br label [[IFCONT4]] // CHECK3: ifcont4: -// 
CHECK3-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK3-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK3-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK3-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK3-NEXT: br label [[PRECOND]] // CHECK3: exit: // CHECK3-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp @@ -607,7 +607,7 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[I:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32* // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 @@ -729,7 +729,7 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK2-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[I:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK2-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32* // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 // CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 @@ -851,7 +851,7 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[I:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK3-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32* // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -18534,7 +18534,7 @@ // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[L2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* // CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 @@ -20128,7 +20128,7 @@ // CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 // CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[L2:%.*]] = call i8* 
@__kmpc_alloc_shared(i64 4) +// CHECK2-NEXT: [[L2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4) // CHECK2-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* // CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 @@ -21711,7 +21711,7 @@ // CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[L1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK3-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* // CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 @@ -23245,7 +23245,7 @@ // CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK4-NEXT: [[L1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK4-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* // CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -9447,7 +9447,7 @@ // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[L2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* // CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 @@ -10525,7 +10525,7 @@ // CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK2-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK2-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[L1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK2-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* // CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 @@ -11563,7 +11563,7 @@ // CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[L1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK3-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* // CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], i32* 
[[DOTCAPTURE_EXPR_]], align 4 diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -903,7 +903,7 @@ // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4 -// CHECK1-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i32* // CHECK1-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -943,7 +943,7 @@ // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK1-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8) // CHECK1-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i8*** // CHECK1-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -983,7 +983,7 @@ // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK2-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK2-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i32* // CHECK2-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -1023,7 +1023,7 @@ // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 -// CHECK2-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK2-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i8*** // CHECK2-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -1070,7 +1070,7 @@ // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV2]], align 4 -// CHECK3-NEXT: [[ARGC3:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK3-NEXT: [[ARGC3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4) // CHECK3-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC3]] to i32* // CHECK3-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -1116,7 +1116,7 @@ // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 -// CHECK3-NEXT: [[ARGC2:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK3-NEXT: [[ARGC2:%.*]] = call align 8 i8* 
@__kmpc_alloc_shared(i64 8) // CHECK3-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC2]] to i8*** // CHECK3-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 8 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -1160,7 +1160,7 @@ // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: // CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK4-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK4-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i32* // CHECK4-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4 // CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -1204,7 +1204,7 @@ // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: // CHECK4-NEXT: [[TMP1:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[ARGC1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK4-NEXT: [[ARGC1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK4-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC1]] to i8*** // CHECK4-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 4 // CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -4219,7 +4219,7 @@ // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[CONV]], align 8 -// CHECK1-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK1-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8) // CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* // CHECK1-NEXT: store double [[TMP1]], double* [[E_ON_STACK]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -4244,7 +4244,7 @@ // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 // CHECK1-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 -// CHECK1-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK1-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8) // CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* // CHECK1-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8 @@ -4521,10 +4521,10 @@ // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1 -// CHECK1-NEXT: [[C2:%.*]] = call i8* @__kmpc_alloc_shared(i64 1) +// CHECK1-NEXT: [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 1) // CHECK1-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1 // CHECK1-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4 -// CHECK1-NEXT: [[D3:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: 
[[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float* // CHECK1-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -4553,8 +4553,8 @@ // CHECK1-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 1) -// CHECK1-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 1) +// CHECK1-NEXT: [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4) // CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float* // CHECK1-NEXT: store i8 0, i8* [[C1]], align 1 // CHECK1-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4 @@ -5563,7 +5563,7 @@ // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK2-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i32 8) +// CHECK2-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 8) // CHECK2-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* // CHECK2-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8 // CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8 @@ -5840,10 +5840,10 @@ // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1 -// CHECK2-NEXT: [[C2:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) +// CHECK2-NEXT: [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1) // CHECK2-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1 // CHECK2-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4 -// CHECK2-NEXT: [[D3:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float* // CHECK2-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -5872,8 +5872,8 @@ // CHECK2-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 // CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 -// CHECK2-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) -// CHECK2-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1) +// CHECK2-NEXT: [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float* // CHECK2-NEXT: store i8 0, i8* [[C1]], align 1 // CHECK2-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4 @@ -6881,7 +6881,7 @@ // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK3-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i32 8) +// CHECK3-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 8) // CHECK3-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* // CHECK3-NEXT: store double 
0.000000e+00, double* [[E_ON_STACK]], align 8 // CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8 @@ -7158,10 +7158,10 @@ // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1 -// CHECK3-NEXT: [[C2:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) +// CHECK3-NEXT: [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1) // CHECK3-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1 // CHECK3-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4 -// CHECK3-NEXT: [[D3:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float* // CHECK3-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) @@ -7190,8 +7190,8 @@ // CHECK3-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) -// CHECK3-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1) +// CHECK3-NEXT: [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4) // CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float* // CHECK3-NEXT: store i8 0, i8* [[C1]], align 1 // CHECK3-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4 diff --git a/clang/test/OpenMP/parallel_num_threads_codegen.cpp b/clang/test/OpenMP/parallel_num_threads_codegen.cpp --- a/clang/test/OpenMP/parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/parallel_num_threads_codegen.cpp @@ -15,7 +15,7 @@ // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* } // CHECK-DAG: [[S_TY:%.+]] = type { [[INTPTR_T_TY:i[0-9]+]], [[INTPTR_T_TY]], [[INTPTR_T_TY]] } // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" -// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } +// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } void foo(); diff --git a/clang/test/OpenMP/parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/parallel_proc_bind_codegen.cpp --- a/clang/test/OpenMP/parallel_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/parallel_proc_bind_codegen.cpp @@ -14,7 +14,7 @@ // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* } // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" -// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } +// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } void foo(); diff --git a/clang/test/OpenMP/parallel_proc_bind_primary_codegen.cpp b/clang/test/OpenMP/parallel_proc_bind_primary_codegen.cpp --- a/clang/test/OpenMP/parallel_proc_bind_primary_codegen.cpp +++ 
b/clang/test/OpenMP/parallel_proc_bind_primary_codegen.cpp @@ -15,7 +15,7 @@ // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* } // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00" -// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } +// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) } void foo(); diff --git a/clang/test/OpenMP/sections_codegen.cpp b/clang/test/OpenMP/sections_codegen.cpp --- a/clang/test/OpenMP/sections_codegen.cpp +++ b/clang/test/OpenMP/sections_codegen.cpp @@ -9,8 +9,8 @@ // expected-no-diagnostics #ifndef HEADER #define HEADER -// CHECK-DAG: [[IMPLICIT_BARRIER_SECTIONS_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8* -// CHECK-DAG: [[SECTIONS_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 1026, i32 0, i32 0, i8* +// CHECK-DAG: [[IMPLICIT_BARRIER_SECTIONS_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 {{[0-9]+}}, i8* +// CHECK-DAG: [[SECTIONS_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 1026, i32 0, i32 {{[0-9]+}}, i8* // CHECK-LABEL: foo void foo() { extern void mayThrow(); mayThrow(); }; // CHECK-LABEL: bar diff --git a/clang/test/SemaCXX/PR51712-large-array-constexpr-check-oom.cpp b/clang/test/SemaCXX/PR51712-large-array-constexpr-check-oom.cpp new file mode 100644 --- /dev/null +++ b/clang/test/SemaCXX/PR51712-large-array-constexpr-check-oom.cpp @@ -0,0 +1,17 @@ +// Only run this test where ulimit is known to work well. +// (There's nothing really platform-specific being tested, this is just ulimit). +// +// REQUIRES: shell +// REQUIRES: system-linux +// UNSUPPORTED: msan +// UNSUPPORTED: asan +// +// RUN: ulimit -v 1048576 +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -triple=x86_64 %s +// expected-no-diagnostics + +// This used to require too much memory and crash with OOM. 
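+// (Back-of-envelope check, added for orientation: the array below has (1<<30) elements of a 16-byte struct, i.e. 16 GiB, while the ulimit above caps virtual memory at 1048576 KiB = 1 GiB.)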
+struct { + int a, b, c, d; +} arr[1<<30]; + diff --git a/clang/test/CodeGenOpenCLCXX/remove-address-space.clcpp b/clang/test/SemaOpenCLCXX/remove-address-space.clcpp rename from clang/test/CodeGenOpenCLCXX/remove-address-space.clcpp rename to clang/test/SemaOpenCLCXX/remove-address-space.clcpp --- a/clang/test/CodeGenOpenCLCXX/remove-address-space.clcpp +++ b/clang/test/SemaOpenCLCXX/remove-address-space.clcpp @@ -1,4 +1,8 @@ -// RUN: %clang_cc1 %s -cl-std=clc++ -fdeclare-opencl-builtins -finclude-default-header +// RUN: %clang_cc1 %s -cl-std=clc++1.0 -triple spir-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -verify +// RUN: %clang_cc1 %s -cl-std=clc++2021 -triple spir-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -verify +// RUN: %clang_cc1 %s -cl-std=clc++2021 -cl-ext=-__opencl_c_generic_address_space,-__opencl_c_pipes -triple spir-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -verify + +// expected-no-diagnostics template <class T, class U> struct is_same { @@ -19,8 +23,10 @@ void test_remove_address_space() { static_assert(is_same<__remove_address_space<int>::type, int>::value, "type without an address space unexpectedly modified by __remove_address_space"); +#if defined(__opencl_c_generic_address_space) static_assert(is_same<__remove_address_space<__generic int>::type, int>::value, "__generic address space not removed by __remove_address_space"); +#endif static_assert(is_same<__remove_address_space<__global char>::type, char>::value, "__global address space not removed by __remove_address_space"); static_assert(is_same<__remove_address_space<__private ulong>::type, ulong>::value, diff --git a/clang/test/SemaSYCL/zero-length-arrays.cpp b/clang/test/SemaSYCL/zero-length-arrays.cpp new file mode 100644 --- /dev/null +++ b/clang/test/SemaSYCL/zero-length-arrays.cpp @@ -0,0 +1,125 @@ +// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -fsyntax-only -verify %s +// +// This test checks that the compiler reports a compilation error on an attempt +// to use a zero-length array inside device code.
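+// (Context note: zero-length arrays are a GNU extension that Clang otherwise accepts; in SYCL device code any use of them must be diagnosed, which is what the cases below exercise.)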
+ +template <typename Name, typename Func> +__attribute__((sycl_kernel)) void kernel(const Func &kernelFunc) { + // expected-note@+1 5{{called by 'kernel}} + kernelFunc(); // #KernelObjCall +} + +typedef float ZEROARR[0]; + +struct Wrapper { + int A; + int BadArray[0]; // expected-note 3{{field of illegal type 'int[0]' declared here}} +}; + +struct WrapperOfWrapper { // expected-error 2{{zero-length arrays are not permitted in SYCL device code}} + Wrapper F; // expected-note 2{{within field of type 'Wrapper' declared here}} + ZEROARR *Ptr; //expected-note 5{{field of illegal pointer type 'ZEROARR *' (aka 'float (*)[0]') declared here}} +}; + +template <unsigned Size> struct InnerTemplated { + double Array[Size]; // expected-note 8{{field of illegal type 'double[0]' declared here}} +}; + +template <unsigned Size, typename Ty> struct Templated { + unsigned A; + Ty Arr[Size]; + InnerTemplated<Size> Array[Size + 1]; // expected-note 8{{within field of type 'InnerTemplated<0U>[1]' declared here}} +}; + +struct KernelSt { + int A; + int BadArray[0]; // expected-note {{field of illegal type 'int[0]' declared here}} + void operator()() const {} +}; + +WrapperOfWrapper offendingFoo() { + // expected-note@+1 {{called by 'offendingFoo'}} + return WrapperOfWrapper{}; +} + +template <unsigned Size> +void templatedContext() { + Templated<Size, float> Var; + // expected-error@#KernelObjCall 2{{zero-length arrays are not permitted in SYCL device code}} + // expected-note@#KernelObjCall {{called by 'kernel([=] { + // expected-note@+1 {{within field of type 'Templated<0U, float>' declared here}} + (void)Var; // expected-error 2{{zero-length arrays are not permitted in SYCL device code}} + }); + // expected-error@#KernelObjCall {{zero-length arrays are not permitted in SYCL device code}} + // expected-note@+2 {{in instantiation of function template specialization}} + // expected-note@+1 {{within field of type 'Templated<0U, float>' declared here}} + kernel([Var] { + }); +} + +void foo(const unsigned X) { + int Arr[0]; // expected-note 2{{declared here}} + ZEROARR TypeDef; // expected-note {{declared here}} + ZEROARR *Ptr; // expected-note {{declared here}} + // expected-error@#KernelObjCall 3{{zero-length arrays are not permitted in SYCL device code}} + // expected-note@+1 {{in instantiation of function template specialization}} + kernel([=]() { + (void)Arr; // expected-error {{zero-length arrays are not permitted in SYCL device code}} + (void)TypeDef; // expected-error {{zero-length arrays are not permitted in SYCL device code}} + // expected-note@+1 {{field of illegal pointer type 'ZEROARR *' (aka 'float (*)[0]') declared here}} + (void)Ptr; // expected-error {{zero-length arrays are not permitted in SYCL device code}} + }); + // expected-error@#KernelObjCall {{zero-length arrays are not permitted in SYCL device code}} + // expected-note@+2 {{in instantiation of function
template specialization}} + // expected-note@+1 {{within field of type 'WrapperOfWrapper' declared here}} + kernel([St] { // expected-error 2{{zero-length arrays are not permitted in SYCL device code}} + }); + + Templated<1, int> OK; + Templated<1 - 1, double> Weirdo; + Templated<0, float> Zero; + // expected-error@#KernelObjCall 4{{zero-length arrays are not permitted in SYCL device code}} + // expected-note@+1 {{in instantiation of function template specialization}} + kernel([=] { + (void)OK; // No errors expected + (void)Zero; // expected-error 2{{zero-length arrays are not permitted in SYCL device code}} + // expected-note@+1 {{within field of type 'Templated<1 - 1, double>' declared here}} + int A = Weirdo.A; // expected-error 2{{zero-length arrays are not permitted in SYCL device code}} + }); + + // expected-note@#KernelObjCall {{called by 'kernel' declared here}} + kernel([Zero] { // expected-error 2{{zero-length arrays are not permitted in SYCL device code}} + }); + + templatedContext<10>(); + // expected-note@+1 2{{in instantiation of function template specialization}} + templatedContext<0>(); + + KernelSt K; + // expected-error@#KernelObjCall {{zero-length arrays are not permitted in SYCL device code}} + // expected-note@+1 {{in instantiation of function template specialization}} + kernel(K); + + // expected-note@#KernelObjCall {{called by 'kernel([=] { + // expected-note@+1 {{called by 'operator()'}} + offendingFoo(); + }); +} diff --git a/clang/test/SemaTemplate/constraints.cpp b/clang/test/SemaTemplate/constraints.cpp --- a/clang/test/SemaTemplate/constraints.cpp +++ b/clang/test/SemaTemplate/constraints.cpp @@ -24,3 +24,35 @@ // FIXME: These diagnostics are excessive. static_assert(test == 1); // expected-note 2{{while}} expected-note 2{{during}} } + +namespace PR52905 { +// A mock for std::convertible_to. Not complete support. 
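+// (For reference: the real std::convertible_to additionally requires the explicit static_cast<_To> conversion to be well-formed; this mock checks only the __is_convertible_to trait.)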
+template <class _From, class _To> +concept convertible_to = __is_convertible_to(_From, _To); // expected-note {{evaluated to false}} + +template <typename T> +class A { +public: + using iterator = void **; + + iterator begin(); + const iterator begin() const; +}; + +template <typename T> +concept Beginable1 = requires(T t) { + { t.begin } + ->convertible_to<typename T::iterator>; // expected-note {{not satisfied}} +}; + +static_assert(Beginable1<A<int>>); // expected-error {{static_assert failed}} + // expected-note@-1 {{does not satisfy 'Beginable1'}} + +template <typename T> +concept Beginable2 = requires(T t) { + { t.begin() } + ->convertible_to<typename T::iterator>; +}; + +static_assert(Beginable2<A<int>>); +} // namespace PR52905 diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -4949,7 +4949,7 @@ CXPrintingPolicy clang_getCursorPrintingPolicy(CXCursor C) { if (clang_Cursor_isNull(C)) - return 0; + return nullptr; return new PrintingPolicy(getCursorContext(C).getPrintingPolicy()); } @@ -6975,16 +6975,16 @@ if (isNotUsableTU(TU)) { LOG_BAD_TU(TU); - return NULL; + return nullptr; } ASTUnit *CXXUnit = cxtu::getASTUnit(TU); if (!CXXUnit) - return NULL; + return nullptr; SourceLocation Begin = cxloc::translateSourceLocation(Location); if (Begin.isInvalid()) - return NULL; + return nullptr; SourceManager &SM = CXXUnit->getSourceManager(); std::pair<FileID, unsigned> DecomposedEnd = SM.getDecomposedLoc(Begin); DecomposedEnd.second += @@ -6997,7 +6997,7 @@ getTokens(CXXUnit, SourceRange(Begin, End), CXTokens); if (CXTokens.empty()) - return NULL; + return nullptr; CXTokens.resize(1); CXToken *Token = static_cast<CXToken *>(llvm::safe_malloc(sizeof(CXToken))); diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h --- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h +++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h @@ -20,9 +20,12 @@ #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/ASTMatchers/ASTMatchersInternal.h" #include "clang/Analysis/CFG.h" +#include "clang/Analysis/FlowSensitive/ControlFlowContext.h" #include "clang/Analysis/FlowSensitive/DataflowAnalysis.h" #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" #include "clang/Basic/LLVM.h" +#include "clang/Serialization/PCHContainerOperations.h" +#include "clang/Tooling/ArgumentsAdjusters.h" #include "clang/Tooling/Tooling.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -56,12 +59,6 @@ buildStatementToAnnotationMapping(const FunctionDecl *Func, llvm::Annotations AnnotatedCode); -// Creates a CFG from the body of the function that matches `func_matcher`, -// suitable to testing a dataflow analysis. -std::pair<const FunctionDecl *, std::unique_ptr<CFG>> -buildCFG(ASTContext &Context, - ast_matchers::internal::Matcher<FunctionDecl> FuncMatcher); - // Runs dataflow on the body of the function that matches `func_matcher` in code // snippet `code`. Requires: `Analysis` contains a type `Lattice`.
template <typename AnalysisT> @@ -79,7 +76,10 @@ using StateT = DataflowAnalysisState<typename AnalysisT::Lattice>; llvm::Annotations AnnotatedCode(Code); - auto Unit = tooling::buildASTFromCodeWithArgs(AnnotatedCode.code(), Args); + auto Unit = tooling::buildASTFromCodeWithArgs( + AnnotatedCode.code(), Args, "input.cc", "clang-dataflow-test", + std::make_shared<PCHContainerOperations>(), + tooling::getClangStripDependencyFileAdjuster(), VirtualMappedFiles); auto &Context = Unit->getASTContext(); if (Context.getDiagnostics().getClient()->getNumErrors() != 0) { @@ -87,12 +87,16 @@ "the test log"; } - std::pair<const FunctionDecl *, std::unique_ptr<CFG>> CFGResult = - buildCFG(Context, FuncMatcher); - const auto *F = CFGResult.first; - auto Cfg = std::move(CFGResult.second); - ASSERT_TRUE(F != nullptr) << "Could not find target function"; - ASSERT_TRUE(Cfg != nullptr) << "Could not build control flow graph."; + const FunctionDecl *F = ast_matchers::selectFirst<FunctionDecl>( + "target", + ast_matchers::match( + ast_matchers::functionDecl(ast_matchers::isDefinition(), FuncMatcher) + .bind("target"), + Context)); + ASSERT_TRUE(F != nullptr) << "Could not find target function."; + + auto CFCtx = ControlFlowContext::build(F, F->getBody(), &F->getASTContext()); + ASSERT_TRUE((bool)CFCtx) << "Could not build ControlFlowContext."; Environment Env; auto Analysis = MakeAnalysis(Context, Env); @@ -107,7 +111,7 @@ auto &Annotations = *StmtToAnnotations; std::vector<llvm::Optional<TypeErasedDataflowAnalysisState>> BlockStates = - runTypeErasedDataflowAnalysis(*Cfg, Analysis, Env); + runTypeErasedDataflowAnalysis(*CFCtx, Analysis, Env); if (BlockStates.empty()) { Expectations({}, Context); @@ -117,13 +121,13 @@ // Compute a map from statement annotations to the state computed for // the program point immediately after the annotated statement. std::vector<std::pair<std::string, StateT>> Results; - for (const CFGBlock *Block : *Cfg) { + for (const CFGBlock *Block : CFCtx->getCFG()) { // Skip blocks that were not evaluated.
if (!BlockStates[Block->getBlockID()].hasValue()) continue; transferBlock( - BlockStates, *Block, Env, Analysis, + *CFCtx, BlockStates, *Block, Env, Analysis, [&Results, &Annotations](const clang::CFGStmt &Stmt, const TypeErasedDataflowAnalysisState &State) { auto It = Annotations.find(Stmt.getStmt()); diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp b/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp --- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp @@ -144,26 +144,3 @@ return Result; } - -std::pair<const FunctionDecl *, std::unique_ptr<CFG>> -test::buildCFG(ASTContext &Context, - ast_matchers::internal::Matcher<FunctionDecl> FuncMatcher) { - CFG::BuildOptions Options; - Options.PruneTriviallyFalseEdges = false; - Options.AddInitializers = true; - Options.AddImplicitDtors = true; - Options.AddTemporaryDtors = true; - Options.setAllAlwaysAdd(); - - const FunctionDecl *F = ast_matchers::selectFirst<FunctionDecl>( - "target", - ast_matchers::match( - ast_matchers::functionDecl(ast_matchers::isDefinition(), FuncMatcher) - .bind("target"), - Context)); - if (F == nullptr) - return std::make_pair(nullptr, nullptr); - - return std::make_pair( - F, clang::CFG::buildCFG(F, F->getBody(), &Context, Options)); -} diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp --- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "TestingSupport.h" #include "clang/AST/Decl.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -14,15 +15,24 @@ #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" #include "clang/Analysis/FlowSensitive/DataflowLattice.h" #include "clang/Tooling/Tooling.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include <cassert> #include <memory> +#include <ostream> +#include <string> +#include <utility> #include <vector> using namespace clang; using namespace dataflow; +using ::testing::IsEmpty; +using ::testing::Pair; +using ::testing::UnorderedElementsAre; template <typename AnalysisT> class AnalysisCallback : public ast_matchers::MatchFinder::MatchCallback { @@ -36,21 +46,12 @@ Stmt *Body = Func->getBody(); assert(Body != nullptr); - // FIXME: Consider providing a utility that returns a `CFG::BuildOptions` - // which is a good default for most clients or a utility that directly - // builds the `CFG` using default `CFG::BuildOptions`.
- CFG::BuildOptions Options; - Options.AddImplicitDtors = true; - Options.AddTemporaryDtors = true; - Options.setAllAlwaysAdd(); - - std::unique_ptr<CFG> Cfg = - CFG::buildCFG(nullptr, Body, Result.Context, Options); - assert(Cfg != nullptr); + auto CFCtx = llvm::cantFail( + ControlFlowContext::build(nullptr, Body, Result.Context)); AnalysisT Analysis(*Result.Context); Environment Env; - BlockStates = runDataflowAnalysis(*Cfg, Analysis, Env); + BlockStates = runDataflowAnalysis(CFCtx, Analysis, Env); } std::vector< @@ -141,8 +142,175 @@ } )"); EXPECT_EQ(BlockStates.size(), 4u); - EXPECT_FALSE(BlockStates[0].hasValue()); + EXPECT_TRUE(BlockStates[0].hasValue()); EXPECT_TRUE(BlockStates[1].hasValue()); EXPECT_TRUE(BlockStates[2].hasValue()); EXPECT_TRUE(BlockStates[3].hasValue()); } + +struct FunctionCallLattice { + llvm::SmallSet<std::string, 8> CalledFunctions; + + bool operator==(const FunctionCallLattice &Other) const { + return CalledFunctions == Other.CalledFunctions; + } + + LatticeJoinEffect join(const FunctionCallLattice &Other) { + if (Other.CalledFunctions.empty()) + return LatticeJoinEffect::Unchanged; + const size_t size_before = CalledFunctions.size(); + CalledFunctions.insert(Other.CalledFunctions.begin(), + Other.CalledFunctions.end()); + return CalledFunctions.size() == size_before ? LatticeJoinEffect::Unchanged + : LatticeJoinEffect::Changed; + } +}; + +std::ostream &operator<<(std::ostream &OS, const FunctionCallLattice &L) { + std::string S; + llvm::raw_string_ostream ROS(S); + llvm::interleaveComma(L.CalledFunctions, ROS); + return OS << "{" << S << "}"; +} + +class FunctionCallAnalysis + : public DataflowAnalysis<FunctionCallAnalysis, FunctionCallLattice> { +public: + explicit FunctionCallAnalysis(ASTContext &Context) + : DataflowAnalysis(Context) {} + + static FunctionCallLattice initialElement() { return {}; } + + FunctionCallLattice transfer(const Stmt *S, const FunctionCallLattice &E, + Environment &Env) { + FunctionCallLattice R = E; + if (auto *C = dyn_cast<CallExpr>(S)) { + if (auto *F = dyn_cast<FunctionDecl>(C->getCalleeDecl())) { + R.CalledFunctions.insert(F->getNameInfo().getAsString()); + } + } + return R; + } +}; + +class NoreturnDestructorTest : public ::testing::Test { +protected: + template <typename Matcher> + void runDataflow(llvm::StringRef Code, Matcher Expectations) { + tooling::FileContentMappings FilesContents; + FilesContents.push_back(std::make_pair( + "noreturn_destructor_test_defs.h", R"( + int foo(); + + class Fatal { + public: + ~Fatal() __attribute__((noreturn)); + int bar(); + int baz(); + }; + + class NonFatal { + public: + ~NonFatal(); + int bar(); + }; + )")); + + test::checkDataflow<FunctionCallAnalysis>( + Code, "target", + [](ASTContext &C, Environment &) { return FunctionCallAnalysis(C); }, + [&Expectations]( + llvm::ArrayRef<std::pair<std::string, DataflowAnalysisState<FunctionCallLattice>>> + Results, + ASTContext &) { EXPECT_THAT(Results, Expectations); }, + {"-fsyntax-only", "-std=c++17"}, FilesContents); + } +}; + +MATCHER_P(HoldsFunctionCallLattice, m, + ((negation ? "doesn't hold" : "holds") + + llvm::StringRef(" a lattice element that ") + + ::testing::DescribeMatcher<FunctionCallLattice>(m, negation)) + .str()) { + return ExplainMatchResult(m, arg.Lattice, result_listener); +} + +MATCHER_P(HasCalledFunctions, m, "") { + return ExplainMatchResult(m, arg.CalledFunctions, result_listener); +} + +TEST_F(NoreturnDestructorTest, ConditionalOperatorBothBranchesReturn) { + std::string Code = R"( + #include "noreturn_destructor_test_defs.h" + + void target(bool b) { + int value = b ?
foo() : NonFatal().bar(); + (void)0; + // [[p]] + } + )"; + runDataflow(Code, UnorderedElementsAre( + Pair("p", HoldsFunctionCallLattice(HasCalledFunctions( + UnorderedElementsAre("foo", "bar")))))); +} + +TEST_F(NoreturnDestructorTest, ConditionalOperatorLeftBranchReturns) { + std::string Code = R"( + #include "noreturn_destructor_test_defs.h" + + void target(bool b) { + int value = b ? foo() : Fatal().bar(); + (void)0; + // [[p]] + } + )"; + runDataflow(Code, UnorderedElementsAre( + Pair("p", HoldsFunctionCallLattice(HasCalledFunctions( + UnorderedElementsAre("foo")))))); +} + +TEST_F(NoreturnDestructorTest, ConditionalOperatorRightBranchReturns) { + std::string Code = R"( + #include "noreturn_destructor_test_defs.h" + + void target(bool b) { + int value = b ? Fatal().bar() : foo(); + (void)0; + // [[p]] + } + )"; + runDataflow(Code, UnorderedElementsAre( + Pair("p", HoldsFunctionCallLattice(HasCalledFunctions( + UnorderedElementsAre("foo")))))); +} + +TEST_F(NoreturnDestructorTest, ConditionalOperatorNestedBranchesDoNotReturn) { + std::string Code = R"( + #include "noreturn_destructor_test_defs.h" + + void target(bool b1, bool b2) { + int value = b1 ? foo() : (b2 ? Fatal().bar() : Fatal().baz()); + (void)0; + // [[p]] + } + )"; + runDataflow(Code, IsEmpty()); + // FIXME: Called functions at point `p` should contain "foo". +} + +TEST_F(NoreturnDestructorTest, ConditionalOperatorNestedBranchReturns) { + std::string Code = R"( + #include "noreturn_destructor_test_defs.h" + + void target(bool b1, bool b2) { + int value = b1 ? Fatal().bar() : (b2 ? Fatal().baz() : foo()); + (void)0; + // [[p]] + } + )"; + runDataflow(Code, UnorderedElementsAre( + Pair("p", HoldsFunctionCallLattice(HasCalledFunctions( + UnorderedElementsAre("baz", "foo")))))); + // FIXME: Called functions at point `p` should contain only "foo". +} diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -21584,6 +21584,70 @@ Style.IndentWidth = 7; return Style; }()); + + // Test 9.9: use inheritance from a specific config file. + Style9 = getStyle("file:/e/sub/sub/.clang-format", "/e/sub/sub/code.cpp", + "none", "", &FS); + ASSERT_TRUE(static_cast<bool>(Style9)); + ASSERT_EQ(*Style9, SubSubStyle); +} + +TEST(FormatStyle, GetStyleOfSpecificFile) { + llvm::vfs::InMemoryFileSystem FS; + // Specify absolute path to a format file in a parent directory. + ASSERT_TRUE( + FS.addFile("/e/.clang-format", 0, + llvm::MemoryBuffer::getMemBuffer("BasedOnStyle: LLVM"))); + ASSERT_TRUE( + FS.addFile("/e/explicit.clang-format", 0, + llvm::MemoryBuffer::getMemBuffer("BasedOnStyle: Google"))); + ASSERT_TRUE(FS.addFile("/e/sub/sub/sub/test.cpp", 0, + llvm::MemoryBuffer::getMemBuffer("int i;"))); + auto Style = getStyle("file:/e/explicit.clang-format", + "/e/sub/sub/sub/test.cpp", "LLVM", "", &FS); + ASSERT_TRUE(static_cast<bool>(Style)); + ASSERT_EQ(*Style, getGoogleStyle()); + + // Specify relative path to a format file. + ASSERT_TRUE( + FS.addFile("../../e/explicit.clang-format", 0, + llvm::MemoryBuffer::getMemBuffer("BasedOnStyle: Google"))); + Style = getStyle("file:../../e/explicit.clang-format", + "/e/sub/sub/sub/test.cpp", "LLVM", "", &FS); + ASSERT_TRUE(static_cast<bool>(Style)); + ASSERT_EQ(*Style, getGoogleStyle()); + + // Specify path to a format file that does not exist.
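+ // (Here getStyle is expected to return an error rather than fall back to the default style; the ASSERT_FALSE and consumeError below check that.)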
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -21584,6 +21584,70 @@
     Style.IndentWidth = 7;
     return Style;
   }());
+
+  // Test 9.9: use inheritance from a specific config file.
+  Style9 = getStyle("file:/e/sub/sub/.clang-format", "/e/sub/sub/code.cpp",
+                    "none", "", &FS);
+  ASSERT_TRUE(static_cast<bool>(Style9));
+  ASSERT_EQ(*Style9, SubSubStyle);
+}
+
+TEST(FormatStyle, GetStyleOfSpecificFile) {
+  llvm::vfs::InMemoryFileSystem FS;
+  // Specify absolute path to a format file in a parent directory.
+  ASSERT_TRUE(
+      FS.addFile("/e/.clang-format", 0,
+                 llvm::MemoryBuffer::getMemBuffer("BasedOnStyle: LLVM")));
+  ASSERT_TRUE(
+      FS.addFile("/e/explicit.clang-format", 0,
+                 llvm::MemoryBuffer::getMemBuffer("BasedOnStyle: Google")));
+  ASSERT_TRUE(FS.addFile("/e/sub/sub/sub/test.cpp", 0,
+                         llvm::MemoryBuffer::getMemBuffer("int i;")));
+  auto Style = getStyle("file:/e/explicit.clang-format",
+                        "/e/sub/sub/sub/test.cpp", "LLVM", "", &FS);
+  ASSERT_TRUE(static_cast<bool>(Style));
+  ASSERT_EQ(*Style, getGoogleStyle());
+
+  // Specify relative path to a format file.
+  ASSERT_TRUE(
+      FS.addFile("../../e/explicit.clang-format", 0,
+                 llvm::MemoryBuffer::getMemBuffer("BasedOnStyle: Google")));
+  Style = getStyle("file:../../e/explicit.clang-format",
+                   "/e/sub/sub/sub/test.cpp", "LLVM", "", &FS);
+  ASSERT_TRUE(static_cast<bool>(Style));
+  ASSERT_EQ(*Style, getGoogleStyle());
+
+  // Specify path to a format file that does not exist.
+  Style = getStyle("file:/e/missing.clang-format", "/e/sub/sub/sub/test.cpp",
+                   "LLVM", "", &FS);
+  ASSERT_FALSE(static_cast<bool>(Style));
+  llvm::consumeError(Style.takeError());
+
+  // Specify path to a file on the filesystem.
+  SmallString<128> FormatFilePath;
+  std::error_code ECF = llvm::sys::fs::createTemporaryFile(
+      "FormatFileTest", "tpl", FormatFilePath);
+  EXPECT_FALSE((bool)ECF);
+  llvm::raw_fd_ostream FormatFileTest(FormatFilePath, ECF);
+  EXPECT_FALSE((bool)ECF);
+  FormatFileTest << "BasedOnStyle: Google\n";
+  FormatFileTest.close();
+
+  SmallString<128> TestFilePath;
+  std::error_code ECT =
+      llvm::sys::fs::createTemporaryFile("CodeFileTest", "cc", TestFilePath);
+  EXPECT_FALSE((bool)ECT);
+  llvm::raw_fd_ostream CodeFileTest(TestFilePath, ECT);
+  CodeFileTest << "int i;\n";
+  CodeFileTest.close();
+
+  std::string format_file_arg = std::string("file:") + FormatFilePath.c_str();
+  Style = getStyle(format_file_arg, TestFilePath, "LLVM", "", nullptr);
+
+  llvm::sys::fs::remove(FormatFilePath.c_str());
+  llvm::sys::fs::remove(TestFilePath.c_str());
+  ASSERT_TRUE(static_cast<bool>(Style));
+  ASSERT_EQ(*Style, getGoogleStyle());
 }
 
 TEST_F(ReplacementTest, FormatCodeAfterReplacements) {
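These tests drive the same entry point a tool would use. A hedged sketch of calling it directly (the paths are placeholders, and error handling is reduced to a bool):

#include "clang/Format/Format.h"
#include "llvm/Support/Error.h"

// Resolve the style the way the tests above do: an explicit config file
// selected via the "file:" prefix, with LLVM style as the fallback.
bool hasExplicitStyle() {
  llvm::Expected<clang::format::FormatStyle> Style = clang::format::getStyle(
      "file:/e/explicit.clang-format", "/e/sub/sub/sub/test.cpp", "LLVM");
  if (!Style) {
    llvm::consumeError(Style.takeError());
    return false;
  }
  return true;
}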
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
--- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
@@ -125,11 +125,9 @@
 static int WriteBinaryIdForNote(ProfDataWriter *Writer,
                                 const ElfW(Nhdr) * Note) {
   int BinaryIdSize = 0;
-  const char *NoteName = (const char *)Note + sizeof(ElfW(Nhdr));
   if (Note->n_type == NT_GNU_BUILD_ID && Note->n_namesz == 4 &&
       memcmp(NoteName, "GNU\0", 4) == 0) {
-    uint64_t BinaryIdLen = Note->n_descsz;
     const uint8_t *BinaryIdData =
         (const uint8_t *)(NoteName + RoundUp(Note->n_namesz, 4));
@@ -151,12 +149,12 @@
  */
 static int WriteBinaryIds(ProfDataWriter *Writer, const ElfW(Nhdr) * Note,
                           const ElfW(Nhdr) * NotesEnd) {
-  int TotalBinaryIdsSize = 0;
+  int BinaryIdsSize = 0;
   while (Note < NotesEnd) {
-    int Result = WriteBinaryIdForNote(Writer, Note);
-    if (Result == -1)
+    int OneBinaryIdSize = WriteBinaryIdForNote(Writer, Note);
+    if (OneBinaryIdSize == -1)
       return -1;
-    TotalBinaryIdsSize += Result;
+    BinaryIdsSize += OneBinaryIdSize;
 
     /* Calculate the offset of the next note in notes section. */
     size_t NoteOffset = sizeof(ElfW(Nhdr)) + RoundUp(Note->n_namesz, 4) +
@@ -164,7 +162,7 @@
     Note = (const ElfW(Nhdr) *)((const char *)(Note) + NoteOffset);
   }
 
-  return TotalBinaryIdsSize;
+  return BinaryIdsSize;
 }
 
 /*
@@ -178,21 +176,46 @@
   const ElfW(Phdr) *ProgramHeader =
       (const ElfW(Phdr) *)((uintptr_t)ElfHeader + ElfHeader->e_phoff);
 
+  int TotalBinaryIdsSize = 0;
   uint32_t I;
   /* Iterate through entries in the program header. */
   for (I = 0; I < ElfHeader->e_phnum; I++) {
-    /* Look for the notes section in program header entries. */
+    /* Look for the notes segment in program header entries. */
    if (ProgramHeader[I].p_type != PT_NOTE)
      continue;

-    const ElfW(Nhdr) *Note =
-        (const ElfW(Nhdr) *)((uintptr_t)ElfHeader + ProgramHeader[I].p_offset);
-    const ElfW(Nhdr) *NotesEnd =
-        (const ElfW(Nhdr) *)((const char *)(Note) + ProgramHeader[I].p_filesz);
-    return WriteBinaryIds(Writer, Note, NotesEnd);
+    /* There can be multiple note segments; examine each of them. */
+    const ElfW(Nhdr) * Note;
+    const ElfW(Nhdr) * NotesEnd;
+    /*
+     * When examining notes in the file, use p_offset, which is the offset
+     * within the ELF file, to find the start of notes.
+     */
+    if (ProgramHeader[I].p_memsz == 0 ||
+        ProgramHeader[I].p_memsz == ProgramHeader[I].p_filesz) {
+      Note = (const ElfW(Nhdr) *)((uintptr_t)ElfHeader +
+                                  ProgramHeader[I].p_offset);
+      NotesEnd = (const ElfW(Nhdr) *)((const char *)(Note) +
+                                      ProgramHeader[I].p_filesz);
+    } else {
+      /*
+       * When examining notes in memory, use p_vaddr, which is the address of
+       * the section once it is loaded into memory, to find the start of notes.
+       */
+      Note =
+          (const ElfW(Nhdr) *)((uintptr_t)ElfHeader + ProgramHeader[I].p_vaddr);
+      NotesEnd =
+          (const ElfW(Nhdr) *)((const char *)(Note) + ProgramHeader[I].p_memsz);
+    }
+
+    int BinaryIdsSize = WriteBinaryIds(Writer, Note, NotesEnd);
+    if (BinaryIdsSize == -1)
+      return -1;
+
+    TotalBinaryIdsSize += BinaryIdsSize;
   }
 
-  return 0;
+  return TotalBinaryIdsSize;
 }
 #else /* !NT_GNU_BUILD_ID */
 /*
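For orientation, the segment walk added above has to handle both on-disk images (addressed by file offset) and loaded images (addressed by virtual address), using p_filesz == p_memsz, or p_memsz == 0, as the heuristic for the on-disk case. A simplified sketch of the same traversal, assuming an ELF64 image; this is illustrative, not the runtime code:

#include <elf.h>

#include <cstdint>

// Visit every note in every PT_NOTE segment of an ELF image, mirroring the
// p_offset/p_vaddr distinction made in the patch above.
template <typename Fn>
void visitNotes(const Elf64_Ehdr *Ehdr, Fn Visit) {
  const auto *Phdr =
      reinterpret_cast<const Elf64_Phdr *>((uintptr_t)Ehdr + Ehdr->e_phoff);
  for (uint32_t I = 0; I < Ehdr->e_phnum; I++) {
    if (Phdr[I].p_type != PT_NOTE)
      continue;
    bool InFile = Phdr[I].p_memsz == 0 || Phdr[I].p_memsz == Phdr[I].p_filesz;
    uintptr_t Cur =
        (uintptr_t)Ehdr + (InFile ? Phdr[I].p_offset : Phdr[I].p_vaddr);
    uintptr_t End = Cur + (InFile ? Phdr[I].p_filesz : Phdr[I].p_memsz);
    while (Cur < End) {
      const auto *Note = reinterpret_cast<const Elf64_Nhdr *>(Cur);
      Visit(*Note);
      // Name and descriptor are each padded to 4-byte boundaries.
      Cur += sizeof(Elf64_Nhdr) + ((Note->n_namesz + 3) & ~3u) +
             ((Note->n_descsz + 3) & ~3u);
    }
  }
}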
"instrprof-debug-info-correlate-bar.h" + +typedef int (*FP)(int); +FP Fps[2] = {foo, bar}; + +int main() { + for (int i = 0; i < 5; i++) + Fps[i % 2](i); + return 0; +} diff --git a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c --- a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c +++ b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c @@ -1,33 +1,12 @@ // REQUIRES: zlib // Value profiling is currently not supported in lightweight mode. -// RUN: %clang_pgogen -o %t.normal -mllvm --disable-vp=true %s -// RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t.normal -// RUN: llvm-profdata merge -o %t.normal.profdata %t.profraw - -// RUN: %clang_pgogen -o %t -g -mllvm --debug-info-correlate -mllvm --disable-vp=true %s +// RUN: %clang_pgogen -o %t -g -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.proflite %run %t // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t %t.proflite -// RUN: diff %t.normal.profdata %t.profdata - -int foo(int a) { - if (a % 2) - return 4 * a + 1; - return 0; -} - -int bar(int a) { - while (a > 100) - a /= 2; - return a; -} - -typedef int (*FP)(int); -FP Fps[3] = {foo, bar}; +// RUN: %clang_pgogen -o %t.normal -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t.normal +// RUN: llvm-profdata merge -o %t.normal.profdata %t.profraw -int main() { - for (int i = 0; i < 5; i++) - Fps[i % 2](i); - return 0; -} +// RUN: diff %t.normal.profdata %t.profdata diff --git a/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp b/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp --- a/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp +++ b/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp @@ -9,8 +9,6 @@ auto Identifier = mlir::Identifier::get("foo", &Context); mlir::OperationName OperationName("FooOp", &Context); -mlir::Value Value({reinterpret_cast(0x8), - mlir::Value::Kind::TrailingOpResult}); mlir::Type Type(nullptr); mlir::Type IndexType = mlir::IndexType::get(&Context); @@ -23,6 +21,10 @@ mlir::Type TupleType = mlir::TupleType::get(&Context, mlir::TypeRange({IndexType, FloatType})); + +mlir::detail::OutOfLineOpResult Result(FloatType, 42); +mlir::Value Value(&Result); + auto UnknownLoc = mlir::UnknownLoc::get(&Context); auto FileLineColLoc = mlir::FileLineColLoc::get(&Context, "file", 7, 8); auto OpaqueLoc = mlir::OpaqueLoc::get(9, &Context); diff --git a/libcxx/docs/Status/Cxx2bIssues.csv b/libcxx/docs/Status/Cxx2bIssues.csv --- a/libcxx/docs/Status/Cxx2bIssues.csv +++ b/libcxx/docs/Status/Cxx2bIssues.csv @@ -37,7 +37,7 @@ "`3449 `__","``take_view`` and ``take_while_view``'s ``sentinel`` not comparable with their ``const iterator``","November 2020","","","|ranges|" "`3453 `__","Generic code cannot call ``ranges::advance(i, s)``","November 2020","","","|ranges|" "`3455 `__","Incorrect Postconditions on ``unique_ptr`` move assignment","November 2020","|Nothing To Do|","" -"`3460 `__","Unimplementable ``noop_coroutine_handle`` guarantees","November 2020","","" +"`3460 `__","Unimplementable ``noop_coroutine_handle`` guarantees","November 2020","|Complete|","14.0" "`3461 `__","``convertible_to``'s 
 "`3465 <https://wg21.link/LWG3465>`__","``compare_partial_order_fallback`` requires ``F < E``","November 2020","","","|spaceship|"
 "`3466 <https://wg21.link/LWG3466>`__","Specify the requirements for ``promise``/``future``/``shared_future`` consistently","November 2020","",""
diff --git a/libcxx/include/__algorithm/make_heap.h b/libcxx/include/__algorithm/make_heap.h
--- a/libcxx/include/__algorithm/make_heap.h
+++ b/libcxx/include/__algorithm/make_heap.h
@@ -32,7 +32,7 @@
         // start from the first parent, there is no need to consider children
         for (difference_type __start = (__n - 2) / 2; __start >= 0; --__start)
         {
-            _VSTD::__sift_down<_Compare>(__first, __last, __comp, __n, __first + __start);
+            _VSTD::__sift_down<_Compare>(__first, __comp, __n, __first + __start);
         }
     }
 }
diff --git a/libcxx/include/__algorithm/partial_sort.h b/libcxx/include/__algorithm/partial_sort.h
--- a/libcxx/include/__algorithm/partial_sort.h
+++ b/libcxx/include/__algorithm/partial_sort.h
@@ -40,7 +40,7 @@
         if (__comp(*__i, *__first))
         {
             swap(*__i, *__first);
-            _VSTD::__sift_down<_Compare>(__first, __middle, __comp, __len, __first);
+            _VSTD::__sift_down<_Compare>(__first, __comp, __len, __first);
         }
     }
     _VSTD::__sort_heap<_Compare>(__first, __middle, __comp);
diff --git a/libcxx/include/__algorithm/partial_sort_copy.h b/libcxx/include/__algorithm/partial_sort_copy.h
--- a/libcxx/include/__algorithm/partial_sort_copy.h
+++ b/libcxx/include/__algorithm/partial_sort_copy.h
@@ -40,7 +40,7 @@
             if (__comp(*__first, *__result_first))
             {
                 *__result_first = *__first;
-                _VSTD::__sift_down<_Compare>(__result_first, __r, __comp, __len, __result_first);
+                _VSTD::__sift_down<_Compare>(__result_first, __comp, __len, __result_first);
             }
         _VSTD::__sort_heap<_Compare>(__result_first, __r, __comp);
     }
diff --git a/libcxx/include/__algorithm/pop_heap.h b/libcxx/include/__algorithm/pop_heap.h
--- a/libcxx/include/__algorithm/pop_heap.h
+++ b/libcxx/include/__algorithm/pop_heap.h
@@ -31,7 +31,7 @@
     if (__len > 1)
     {
         swap(*__first, *--__last);
-        _VSTD::__sift_down<_Compare>(__first, __last, __comp, __len - 1, __first);
+        _VSTD::__sift_down<_Compare>(__first, __comp, __len - 1, __first);
     }
 }
diff --git a/libcxx/include/__algorithm/sift_down.h b/libcxx/include/__algorithm/sift_down.h
--- a/libcxx/include/__algorithm/sift_down.h
+++ b/libcxx/include/__algorithm/sift_down.h
@@ -21,8 +21,7 @@
 template <class _Compare, class _RandomAccessIterator>
 _LIBCPP_CONSTEXPR_AFTER_CXX11 void
-__sift_down(_RandomAccessIterator __first, _RandomAccessIterator /*__last*/,
-            _Compare __comp,
+__sift_down(_RandomAccessIterator __first, _Compare __comp,
             typename iterator_traits<_RandomAccessIterator>::difference_type __len,
             _RandomAccessIterator __start)
 {
@@ -46,7 +45,7 @@
     // check if we are in heap-order
     if (__comp(*__child_i, *__start))
-        // we are, __start is larger than it's largest child
+        // we are, __start is larger than its largest child
         return;
 
     value_type __top(_VSTD::move(*__start));
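For context, __sift_down restores the max-heap property below a single node, and make_heap applies it bottom-up starting at the last parent, (n - 2) / 2, since leaves need no sifting. A freestanding sketch of that shape, index-based rather than iterator-based and using the default less-than comparison (names are illustrative):

#include <cstddef>
#include <utility>
#include <vector>

// Sift the element at Start down until neither child compares greater.
template <class T>
void siftDown(std::vector<T> &V, std::size_t Start, std::size_t Len) {
  std::size_t Child = 2 * Start + 1;
  while (Child < Len) {
    // Pick the larger of the two children.
    if (Child + 1 < Len && V[Child] < V[Child + 1])
      ++Child;
    if (!(V[Start] < V[Child]))
      return; // heap order already holds here
    std::swap(V[Start], V[Child]);
    Start = Child;
    Child = 2 * Start + 1;
  }
}

// Bottom-up heapify: leaves are trivially heaps, so start at the last
// parent, (n - 2) / 2, exactly as the make_heap loop above does.
template <class T>
void makeHeap(std::vector<T> &V) {
  if (V.size() < 2)
    return;
  for (std::size_t I = (V.size() - 2) / 2;; --I) {
    siftDown(V, I, V.size());
    if (I == 0)
      break;
  }
}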
diff --git a/libcxx/include/__coroutine/noop_coroutine_handle.h b/libcxx/include/__coroutine/noop_coroutine_handle.h
--- a/libcxx/include/__coroutine/noop_coroutine_handle.h
+++ b/libcxx/include/__coroutine/noop_coroutine_handle.h
@@ -20,7 +20,8 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-#if __has_builtin(__builtin_coro_noop)
+#if __has_builtin(__builtin_coro_noop) || defined(_LIBCPP_COMPILER_GCC)
+
 // [coroutine.noop]
 // [coroutine.promise.noop]
 struct noop_coroutine_promise {};
 
@@ -64,20 +65,45 @@
 
     _LIBCPP_HIDE_FROM_ABI friend coroutine_handle noop_coroutine() noexcept;
 
+#if __has_builtin(__builtin_coro_noop)
     _LIBCPP_HIDE_FROM_ABI coroutine_handle() noexcept {
         this->__handle_ = __builtin_coro_noop();
     }
 
     void* __handle_ = nullptr;
+
+#elif defined(_LIBCPP_COMPILER_GCC)
+    // GCC doesn't implement __builtin_coro_noop().
+    // Construct the coroutine frame manually instead.
+    struct __noop_coroutine_frame_ty_ {
+        static void __dummy_resume_destroy_func() { }
+
+        void (*__resume_)() = __dummy_resume_destroy_func;
+        void (*__destroy_)() = __dummy_resume_destroy_func;
+        struct noop_coroutine_promise __promise_;
+    };
+
+    static __noop_coroutine_frame_ty_ __noop_coroutine_frame_;
+
+    void* __handle_ = &__noop_coroutine_frame_;
+
+    _LIBCPP_HIDE_FROM_ABI coroutine_handle() noexcept = default;
+
+#endif // __has_builtin(__builtin_coro_noop)
 };
 
 using noop_coroutine_handle = coroutine_handle<noop_coroutine_promise>;
 
+#if defined(_LIBCPP_COMPILER_GCC)
+inline noop_coroutine_handle::__noop_coroutine_frame_ty_
+    noop_coroutine_handle::__noop_coroutine_frame_{};
+#endif
+
 // [coroutine.noop.coroutine]
 inline _LIBCPP_HIDE_FROM_ABI noop_coroutine_handle noop_coroutine() noexcept {
     return noop_coroutine_handle();
 }
 
-#endif // __has_builtin(__builtin_coro_noop)
+#endif // __has_builtin(__builtin_coro_noop) || defined(_LIBCPP_COMPILER_GCC)
 
 _LIBCPP_END_NAMESPACE_STD
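The usual consumer of noop_coroutine() is an await_suspend that has nothing left to resume: returning the no-op handle keeps symmetric transfer uniform instead of special-casing "don't resume anything". With the GCC fallback above this keeps working because the handle still points at a frame whose resume/destroy slots are callable. A hedged C++20 usage sketch (the Task type is illustrative, not from this patch):

#include <coroutine>

// Minimal task whose final awaiter returns noop_coroutine(): symmetric
// transfer to the no-op coroutine expresses "nothing to resume next"
// without a null-handle special case.
struct Task {
  struct promise_type {
    Task get_return_object() {
      return {std::coroutine_handle<promise_type>::from_promise(*this)};
    }
    std::suspend_never initial_suspend() noexcept { return {}; }
    struct FinalAwaiter {
      bool await_ready() noexcept { return false; }
      std::coroutine_handle<>
      await_suspend(std::coroutine_handle<>) noexcept {
        return std::noop_coroutine(); // resume "nothing" next
      }
      void await_resume() noexcept {}
    };
    FinalAwaiter final_suspend() noexcept { return {}; }
    void return_void() {}
    void unhandled_exception() {}
  };
  std::coroutine_handle<promise_type> handle;
};

int main() {
  Task t = []() -> Task { co_return; }();
  t.handle.destroy(); // suspended at final_suspend; clean up the frame
  return 0;
}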
diff --git a/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap.pass.cpp
--- a/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap.pass.cpp
@@ -22,8 +22,6 @@
 
 using IterSwapT = decltype(std::ranges::iter_swap);
 
-static_assert(std::semiregular<std::remove_const_t<IterSwapT>>);
-
 struct HasIterSwap {
   int &value_;
   explicit HasIterSwap(int &value) : value_(value) { assert(value == 0); }
diff --git a/libcxx/test/std/language.support/support.coroutines/coroutine.handle/coroutine.handle.noop/noop_coroutine.pass.cpp b/libcxx/test/std/language.support/support.coroutines/coroutine.handle/coroutine.handle.noop/noop_coroutine.pass.cpp
--- a/libcxx/test/std/language.support/support.coroutines/coroutine.handle/coroutine.handle.noop/noop_coroutine.pass.cpp
+++ b/libcxx/test/std/language.support/support.coroutines/coroutine.handle/coroutine.handle.noop/noop_coroutine.pass.cpp
@@ -21,8 +21,6 @@
 
 #include "test_macros.h"
 
-#if __has_builtin(__builtin_coro_noop)
-
 static_assert(std::is_same<std::coroutine_handle<std::noop_coroutine_promise>,
                            std::noop_coroutine_handle>::value, "");
 static_assert(std::is_same<decltype(std::noop_coroutine()),
                            std::noop_coroutine_handle>::value, "");
@@ -57,20 +55,25 @@
   h.resume();
   h.destroy();
   h();
-  static_assert(h.done() == false, "");
   static_assert(h, "");
+  static_assert(h.done() == false, "");
+
+  // [coroutine.handle.noop.resumption]p2
+  // Remarks: If noop_coroutine_handle is converted to coroutine_handle<>,
+  // calls to operator(), resume and destroy on that handle will also have
+  // no observable effects.
+  base.resume();
+  base.destroy();
+  base();
+  assert(base);
+  assert(base.done() == false);
 
   h.promise();
   assert(h.address() == base.address());
-  assert(h==base);
+  assert(h == base);
   assert(h.address() != nullptr);
   assert(std::coroutine_handle<>::from_address(h.address()) == base);
 
   return 0;
 }
-
-#else
-
-int main(int, char**) { return 0; }
-
-#endif // __has_builtin(__builtin_coro_noop)
diff --git a/libcxx/test/std/ranges/range.access/size.pass.cpp b/libcxx/test/std/ranges/range.access/size.pass.cpp
--- a/libcxx/test/std/ranges/range.access/size.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/size.pass.cpp
@@ -36,8 +36,6 @@
 static_assert(std::ranges::size(std::as_const(array_of_incomplete)) == 42);
 static_assert(std::ranges::size(static_cast<const Incomplete(&&)[42]>(array_of_incomplete)) == 42);
 
-static_assert(std::semiregular<std::remove_const_t<decltype(std::ranges::size)>>);
-
 struct SizeMember {
   constexpr size_t size() { return 42; }
 };
diff --git a/libcxx/test/std/ranges/range.access/ssize.pass.cpp b/libcxx/test/std/ranges/range.access/ssize.pass.cpp
--- a/libcxx/test/std/ranges/range.access/ssize.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/ssize.pass.cpp
@@ -25,8 +25,6 @@
 static_assert( std::is_invocable_v<decltype(std::ranges::ssize), int[1]>);
 static_assert( std::is_invocable_v<decltype(std::ranges::ssize), int(&)[1]>);
 
-static_assert(std::semiregular<std::remove_const_t<decltype(std::ranges::ssize)>>);
-
 struct SizeMember {
   constexpr size_t size() { return 42; }
 };
diff --git a/libcxx/test/std/ranges/range.adaptors/range.counted/counted.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.counted/counted.pass.cpp
--- a/libcxx/test/std/ranges/range.adaptors/range.counted/counted.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.counted/counted.pass.cpp
@@ -48,9 +48,6 @@
   {
     static_assert(std::addressof(std::views::counted) == std::addressof(std::ranges::views::counted));
 
-    auto copy = std::views::counted;
-    static_assert(std::semiregular<decltype(copy)>);
-
     static_assert( CountedInvocable);
     static_assert(!CountedInvocable);
     static_assert( CountedInvocable);
diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp
--- a/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp
+++ b/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp
@@ -43,11 +43,6 @@
     ASSERT_SAME_TYPE(decltype(std::views::iota(T(10))), std::ranges::iota_view<T>);
     ASSERT_SAME_TYPE(decltype(std::views::iota(T(10), u)), std::ranges::iota_view<T, U>);
   }
-  // Test that this is semiregular.
-  // Note: we cannot test perfect forwarding because both T and U must be copyable.
-  {
-    static_assert(std::semiregular<std::ranges::iota_view<T, U>>);
-  }
 }
 
 struct X {};
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -304,7 +304,8 @@
   JmpInsnOpcode jInvert = invertJmpOpcode(jmpOpcodeB);
   if (jInvert == J_UNKNOWN)
     return false;
-  is.jumpInstrMods.push_back({jInvert, (rB.offset - 1), 4});
+  is.jumpInstrMod = make<JumpInstrMod>();
+  *is.jumpInstrMod = {rB.offset - 1, jInvert, 4};
 
   // Move R's values to rB except the offset.
   rB = {r.expr, r.type, rB.offset, r.addend, r.sym};
   // Cancel R
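The recorded JumpInstrMod describes a single rewrite: when basic-block sections place block layout so that an unconditional jmp's target becomes the fall-through, the preceding conditional jump is inverted and the jmp is deleted. Schematically, as a hedged illustration (not lld code):

// Invert an x86 condition code: a short jcc and its inverse differ only
// in the low bit of the opcode (e.g. je 0x74 <-> jne 0x75), which is the
// pairing deleteFallThruJmpInsn relies on.
//
//   before:              after:
//     je  .LBB1            jne .LBB2   ; inverted, retargeted
//     jmp .LBB2          .LBB1:        ; fall through; jmp deleted
//   .LBB1:
//
#include <cstdint>

inline uint8_t invertJcc(uint8_t Opcode) { return Opcode ^ 1; }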
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -97,9 +97,10 @@
   whyExtract.clear();
   tar = nullptr;
-  memset(&in, 0, sizeof(in));
+  in.reset();
 
-  partitions = {Partition()};
+  partitions.clear();
+  partitions.emplace_back();
 
   SharedFile::vernauxNum = 0;
 };
@@ -116,7 +117,8 @@
   script = std::make_unique<LinkerScript>();
   symtab = std::make_unique<SymbolTable>();
 
-  partitions = {Partition()};
+  partitions.clear();
+  partitions.emplace_back();
 
   config->progName = args[0];
 
@@ -2369,12 +2371,6 @@
   // except a few linker-synthesized ones will be added to the symbol table.
   compileBitcodeFiles();
 
-  // Handle --exclude-libs again because lto.tmp may reference additional
-  // libcalls symbols defined in an excluded archive. This may override
-  // versionId set by scanVersionScript().
-  if (args.hasArg(OPT_exclude_libs))
-    excludeLibs(args);
-
   // Symbol resolution finished. Report backward reference problems.
   reportBackrefs();
   if (errorCount())
@@ -2392,9 +2388,18 @@
       !config->thinLTOModulesToCompile.empty())
     return;
 
+  // Handle --exclude-libs again because lto.tmp may reference additional
+  // libcalls symbols defined in an excluded archive. This may override
+  // versionId set by scanVersionScript().
+  if (args.hasArg(OPT_exclude_libs))
+    excludeLibs(args);
+
   // Apply symbol renames for --wrap and combine foo@v1 and foo@@v1.
   redirectSymbols(wrapped);
 
+  // Replace common symbols with regular symbols.
+  replaceCommonSymbols();
+
   {
     llvm::TimeTraceScope timeScope("Aggregate sections");
     // Now that we have a complete list of input files.
@@ -2479,9 +2484,6 @@
   if (!config->relocatable)
     inputSections.push_back(createCommentSection());
 
-  // Replace common symbols with regular symbols.
-  replaceCommonSymbols();
-
   // Split SHF_MERGE and .eh_frame sections into pieces in preparation for
   // garbage collection.
   splitSections();
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -66,7 +66,6 @@
   enum Kind : uint8_t {
     ObjKind,
     SharedKind,
-    LazyObjKind,
    ArchiveKind,
    BitcodeKind,
    BinaryKind,
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -879,8 +879,8 @@
     // to work. In a full implementation we would merge all attribute
     // sections.
     if (in.attributes == nullptr) {
-      in.attributes = make<InputSection>(*this, sec, name);
-      return in.attributes;
+      in.attributes = std::make_unique<InputSection>(*this, sec, name);
+      return in.attributes.get();
     }
     return &InputSection::discarded;
   }
@@ -901,8 +901,8 @@
     // standard extensions to enable. In a full implementation we would merge
     // all attribute sections.
     if (in.attributes == nullptr) {
-      in.attributes = make<InputSection>(*this, sec, name);
-      return in.attributes;
+      in.attributes = std::make_unique<InputSection>(*this, sec, name);
+      return in.attributes.get();
     }
     return &InputSection::discarded;
   }
diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h
--- a/lld/ELF/InputSection.h
+++ b/lld/ELF/InputSection.h
@@ -130,13 +130,16 @@
   // one or two jump instructions at the end that could be relaxed to a smaller
   // instruction. The members below help trimming the trailing jump instruction
   // and shrinking a section.
-  unsigned bytesDropped = 0;
+  uint8_t bytesDropped = 0;
 
   // Whether the section needs to be padded with a NOP filler due to
   // deleteFallThruJmpInsn.
   bool nopFiller = false;
 
-  void drop_back(uint64_t num) { bytesDropped += num; }
+  void drop_back(unsigned num) {
+    assert(bytesDropped + num < 256);
+    bytesDropped += num;
+  }
 
   void push_back(uint64_t num) {
     assert(bytesDropped >= num);
@@ -203,7 +206,7 @@
   // block sections are enabled. Basic block sections creates opportunities to
   // relax jump instructions at basic block boundaries after reordering the
   // basic blocks.
-  SmallVector<JumpInstrMod, 0> jumpInstrMods;
+  JumpInstrMod *jumpInstrMod = nullptr;
 
   // A function compiled with -fsplit-stack calling a function
   // compiled without -fsplit-stack needs its prologue adjusted. Find
@@ -321,7 +324,7 @@
 
   // Splittable sections are handled as a sequence of data
   // rather than a single large blob of data.
-  std::vector<SectionPiece> pieces;
+  SmallVector<SectionPiece, 0> pieces;
 
   SyntheticSection *getParent() const;
 };
@@ -376,11 +379,7 @@
   template <class ELFT> void copyShtGroup(uint8_t *buf);
 };
 
-#ifdef _WIN32
-static_assert(sizeof(InputSection) <= 184, "InputSection is too big");
-#else
-static_assert(sizeof(InputSection) <= 176, "InputSection is too big");
-#endif
+static_assert(sizeof(InputSection) <= 160, "InputSection is too big");
 
 inline bool isDebugSection(const InputSectionBase &sec) {
   return (sec.flags & llvm::ELF::SHF_ALLOC) == 0 &&
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -1106,12 +1106,9 @@
   // a jmp insn must be modified to shrink the jmp insn or to flip the jmp
   // insn. This is primarily used to relax and optimize jumps created with
   // basic block sections.
-  if (isa<InputSection>(this)) {
-    for (const JumpInstrMod &jumpMod : jumpInstrMods) {
-      uint64_t offset = jumpMod.offset;
-      uint8_t *bufLoc = buf + offset;
-      target.applyJumpInstrMod(bufLoc, jumpMod.original, jumpMod.size);
-    }
+  if (jumpInstrMod) {
+    target.applyJumpInstrMod(buf + jumpInstrMod->offset, jumpInstrMod->original,
+                             jumpInstrMod->size);
   }
 }
diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -561,7 +561,7 @@
 }
 
 void LinkerScript::discard(InputSectionBase &s) {
-  if (&s == in.shStrTab || &s == mainPart->relrDyn)
+  if (&s == in.shStrTab.get() || &s == mainPart->relrDyn.get())
     error("discarding " + s.name + " section is not allowed");
 
   // You can discard .hash and .gnu.hash sections by linker scripts. Since
@@ -829,8 +829,7 @@
   StringMap<TinyPtrVector<OutputSection *>> map;
   SmallVector<OutputSection *, 0> v;
 
-  std::function<void(InputSectionBase *)> add;
-  add = [&](InputSectionBase *s) {
+  auto add = [&](InputSectionBase *s) {
     if (s->isLive() && !s->parent) {
       orphanSections.push_back(s);
 
@@ -846,11 +845,6 @@
                s->getOutputSection()->sectionIndex == UINT32_MAX);
       }
     }
-
-    if (config->relocatable)
-      for (InputSectionBase *depSec : s->dependentSections)
-        if (depSec->flags & SHF_LINK_ORDER)
-          add(depSec);
   };
 
   // For further --emit-reloc handling code we need target output section
@@ -869,6 +863,10 @@
     if (auto *relIS = dyn_cast_or_null<InputSection>(rel->parent))
       add(relIS);
     add(isec);
+    if (config->relocatable)
+      for (InputSectionBase *depSec : isec->dependentSections)
+        if (depSec->flags & SHF_LINK_ORDER)
+          add(depSec);
   }
 
   // If no SECTIONS command was given, we should insert sections commands
diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp
--- a/lld/ELF/MarkLive.cpp
+++ b/lld/ELF/MarkLive.cpp
@@ -177,11 +177,12 @@
     // SHT_NOTE sections in a group are subject to garbage collection.
     return !sec->nextInSectionGroup;
   default:
-    // Support SHT_PROGBITS .init_array for a while
-    // (https://golang.org/issue/50295).
+    // Support SHT_PROGBITS .init_array (https://golang.org/issue/50295) and
+    // .init_array.N (https://github.com/rust-lang/rust/issues/92181) for a
+    // while.
     StringRef s = sec->name;
-    return s == ".init" || s == ".fini" || s == ".init_array" || s == ".jcr" ||
-           s.startswith(".ctors") || s.startswith(".dtors");
+    return s == ".init" || s == ".fini" || s.startswith(".init_array") ||
+           s == ".jcr" || s.startswith(".ctors") || s.startswith(".dtors");
   }
 }
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -117,8 +117,8 @@
 // jump instruction opcodes at basic block boundaries and are particularly
 // useful when basic block sections are enabled.
 struct JumpInstrMod {
-  JumpModType original;
   uint64_t offset;
+  JumpModType original;
   unsigned size;
 };
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -870,7 +870,7 @@
 
   // If preemptible, emit a GLOB_DAT relocation.
   if (sym.isPreemptible) {
-    mainPart->relaDyn->addReloc({target->gotRel, in.got, off,
+    mainPart->relaDyn->addReloc({target->gotRel, in.got.get(), off,
                                  DynamicReloc::AgainstSymbol, sym, 0, R_ABS});
     return;
   }
@@ -1551,7 +1551,7 @@
   if (sym.hasDirectReloc) {
     // Change the value to the IPLT and redirect all references to it.
     auto &d = cast<Defined>(sym);
-    d.section = in.iplt;
+    d.section = in.iplt.get();
     d.value = sym.pltIndex * target->ipltEntrySize;
     d.size = 0;
     // It's important to set the symbol type here so that dynamic loaders
@@ -1641,8 +1641,8 @@
       in.got->relocations.push_back(
          {R_ADDEND, target->symbolicRel, in.got->getTlsIndexOff(), 1, &sym});
     else
-      mainPart->relaDyn->addReloc(
-          {target->tlsModuleIndexRel, in.got, in.got->getTlsIndexOff()});
+      mainPart->relaDyn->addReloc({target->tlsModuleIndexRel, in.got.get(),
+                                   in.got->getTlsIndexOff()});
   }
   if (sym.needsGotDtprel) {
     in.got->addEntry(sym);
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -34,7 +34,6 @@
 class Defined;
 struct PhdrEntry;
 class SymbolTableBaseSection;
-class VersionNeedBaseSection;
 
 class SyntheticSection : public InputSection {
 public:
@@ -1203,24 +1202,24 @@
   StringRef name;
   uint64_t nameStrTab;
 
-  SyntheticSection *elfHeader;
-  SyntheticSection *programHeaders;
+  std::unique_ptr<SyntheticSection> elfHeader;
+  std::unique_ptr<SyntheticSection> programHeaders;
   SmallVector<PhdrEntry *, 0> phdrs;
 
-  ARMExidxSyntheticSection *armExidx;
-  BuildIdSection *buildId;
-  SyntheticSection *dynamic;
-  StringTableSection *dynStrTab;
-  SymbolTableBaseSection *dynSymTab;
-  EhFrameHeader *ehFrameHdr;
-  EhFrameSection *ehFrame;
+  std::unique_ptr<ARMExidxSyntheticSection> armExidx;
+  std::unique_ptr<BuildIdSection> buildId;
+  std::unique_ptr<SyntheticSection> dynamic;
+  std::unique_ptr<StringTableSection> dynStrTab;
+  std::unique_ptr<SymbolTableBaseSection> dynSymTab;
+  std::unique_ptr<EhFrameHeader> ehFrameHdr;
+  std::unique_ptr<EhFrameSection> ehFrame;
   GnuHashTableSection *gnuHashTab;
   HashTableSection *hashTab;
-  RelocationBaseSection *relaDyn;
-  RelrBaseSection *relrDyn;
-  VersionDefinitionSection *verDef;
-  SyntheticSection *verNeed;
-  VersionTableSection *verSym;
+  std::unique_ptr<RelocationBaseSection> relaDyn;
+  std::unique_ptr<RelrBaseSection> relrDyn;
+  std::unique_ptr<VersionDefinitionSection> verDef;
+  std::unique_ptr<SyntheticSection> verNeed;
+  std::unique_ptr<VersionTableSection> verSym;
 
   unsigned getNumber() const { return this - &partitions[0] + 1; }
 };
@@ -1235,27 +1234,29 @@
 
 // Linker generated sections which can be used as inputs and are not specific to
 // a partition.
 struct InStruct {
-  InputSection *attributes;
-  BssSection *bss;
-  BssSection *bssRelRo;
-  GotSection *got;
-  GotPltSection *gotPlt;
-  IgotPltSection *igotPlt;
-  PPC64LongBranchTargetSection *ppc64LongBranchTarget;
-  MipsGotSection *mipsGot;
-  MipsRldMapSection *mipsRldMap;
-  SyntheticSection *partEnd;
-  SyntheticSection *partIndex;
-  PltSection *plt;
-  IpltSection *iplt;
-  PPC32Got2Section *ppc32Got2;
-  IBTPltSection *ibtPlt;
-  RelocationBaseSection *relaPlt;
-  RelocationBaseSection *relaIplt;
-  StringTableSection *shStrTab;
-  StringTableSection *strTab;
-  SymbolTableBaseSection *symTab;
-  SymtabShndxSection *symTabShndx;
+  std::unique_ptr<InputSection> attributes;
+  std::unique_ptr<BssSection> bss;
+  std::unique_ptr<BssSection> bssRelRo;
+  std::unique_ptr<GotSection> got;
+  std::unique_ptr<GotPltSection> gotPlt;
+  std::unique_ptr<IgotPltSection> igotPlt;
+  std::unique_ptr<PPC64LongBranchTargetSection> ppc64LongBranchTarget;
+  std::unique_ptr<MipsGotSection> mipsGot;
+  std::unique_ptr<MipsRldMapSection> mipsRldMap;
+  std::unique_ptr<SyntheticSection> partEnd;
+  std::unique_ptr<SyntheticSection> partIndex;
+  std::unique_ptr<PltSection> plt;
+  std::unique_ptr<IpltSection> iplt;
+  std::unique_ptr<PPC32Got2Section> ppc32Got2;
+  std::unique_ptr<IBTPltSection> ibtPlt;
+  std::unique_ptr<RelocationBaseSection> relaPlt;
+  std::unique_ptr<RelocationBaseSection> relaIplt;
+  std::unique_ptr<StringTableSection> shStrTab;
+  std::unique_ptr<StringTableSection> strTab;
+  std::unique_ptr<SymbolTableBaseSection> symTab;
+  std::unique_ptr<SymtabShndxSection> symTabShndx;
+
+  void reset();
 };
 
 extern InStruct in;
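Declaring these members as std::unique_ptr means the InStruct::reset() declared here (defined in SyntheticSections.cpp below) actually destroys the sections, where Driver.cpp previously just memset the struct and abandoned the arena-allocated objects. The ownership pattern in miniature, with illustrative types:

#include <memory>

// Illustrative stand-ins for the synthetic-section singletons.
struct Section { /* ... */ };

struct LinkerState {
  std::unique_ptr<Section> got;
  std::unique_ptr<Section> plt;

  // Unlike memset(&state, 0, sizeof(state)), this runs destructors, so
  // repeated in-process links start from a clean slate.
  void reset() {
    got.reset();
    plt.reset();
  }
};

int main() {
  LinkerState state;
  state.got = std::make_unique<Section>();
  state.reset(); // destroys the old sections before the next link run
}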
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -1265,11 +1265,11 @@
 // .rela.dyn
 //
 // DT_RELASZ is the total size of the included sections.
-static uint64_t addRelaSz(RelocationBaseSection *relaDyn) {
-  size_t size = relaDyn->getSize();
-  if (in.relaIplt->getParent() == relaDyn->getParent())
+static uint64_t addRelaSz(const RelocationBaseSection &relaDyn) {
+  size_t size = relaDyn.getSize();
+  if (in.relaIplt->getParent() == relaDyn.getParent())
     size += in.relaIplt->getSize();
-  if (in.relaPlt->getParent() == relaDyn->getParent())
+  if (in.relaPlt->getParent() == relaDyn.getParent())
     size += in.relaPlt->getSize();
   return size;
 }
@@ -1375,7 +1375,8 @@
       (in.relaIplt->isNeeded() &&
        part.relaDyn->getParent() == in.relaIplt->getParent())) {
     addInSec(part.relaDyn->dynamicTag, *part.relaDyn);
-    entries.emplace_back(part.relaDyn->sizeDynamicTag, addRelaSz(part.relaDyn));
+    entries.emplace_back(part.relaDyn->sizeDynamicTag,
+                         addRelaSz(*part.relaDyn));
 
     bool isRela = config->isRela;
     addInt(isRela ? DT_RELAENT : DT_RELENT,
@@ -1626,7 +1627,7 @@
 }
 
 void RelocationBaseSection::finalizeContents() {
-  SymbolTableBaseSection *symTab = getPartition().dynSymTab;
+  SymbolTableBaseSection *symTab = getPartition().dynSymTab.get();
 
   // When linking glibc statically, .rel{,a}.plt contains R_*_IRELATIVE
   // relocations due to IFUNC (e.g. strcpy). sh_link will be set to 0 in that
@@ -1636,11 +1637,11 @@
   else
     getParent()->link = 0;
 
-  if (in.relaPlt == this && in.gotPlt->getParent()) {
+  if (in.relaPlt.get() == this && in.gotPlt->getParent()) {
     getParent()->flags |= ELF::SHF_INFO_LINK;
     getParent()->info = in.gotPlt->getParent()->sectionIndex;
   }
-  if (in.relaIplt == this && in.igotPlt->getParent()) {
+  if (in.relaIplt.get() == this && in.igotPlt->getParent()) {
     getParent()->flags |= ELF::SHF_INFO_LINK;
     getParent()->info = in.igotPlt->getParent()->sectionIndex;
   }
@@ -1677,7 +1678,7 @@
 }
 
 template <class ELFT> void RelocationSection<ELFT>::writeTo(uint8_t *buf) {
-  SymbolTableBaseSection *symTab = getPartition().dynSymTab;
+  SymbolTableBaseSection *symTab = getPartition().dynSymTab.get();
   parallelForEach(relocs,
                   [symTab](DynamicReloc &rel) { rel.computeRaw(symTab); });
@@ -1772,8 +1773,8 @@
   for (const DynamicReloc &rel : relocs) {
     Elf_Rela r;
     r.r_offset = rel.getOffset();
-    r.setSymbolAndType(rel.getSymIndex(getPartition().dynSymTab), rel.type,
-                       false);
+    r.setSymbolAndType(rel.getSymIndex(getPartition().dynSymTab.get()),
+                       rel.type, false);
     if (config->isRela)
       r.r_addend = rel.computeAddend();
@@ -2099,7 +2100,7 @@
 
   // Only the main partition's dynsym indexes are stored in the symbols
   // themselves. All other partitions use a lookup table.
-  if (this == mainPart->dynSymTab) {
+  if (this == mainPart->dynSymTab.get()) {
     size_t i = 0;
     for (const SymbolTableEntry &s : symbols)
       s.sym->dynsymIndex = ++i;
@@ -2145,7 +2146,7 @@
 }
 
 size_t SymbolTableBaseSection::getSymbolIndex(Symbol *sym) {
-  if (this == mainPart->dynSymTab)
+  if (this == mainPart->dynSymTab.get())
     return sym->dynsymIndex;
 
   // Initializes symbol lookup tables lazily. This is used only for -r,
@@ -2182,8 +2183,6 @@
 }
 
 static uint32_t getSymSectionIndex(Symbol *sym) {
-  if (getCommonSec(sym))
-    return SHN_COMMON;
   assert(!(sym->needsCopy && sym->isObject()));
   if (!isa<Defined>(sym) || sym->needsCopy)
     return SHN_UNDEF;
@@ -2205,10 +2204,10 @@
     Symbol *sym = ent.sym;
     bool isDefinedHere = type == SHT_SYMTAB || sym->partition == partition;
 
-    // Set st_info and st_other.
-    eSym->st_other = 0;
+    // Set st_name, st_info and st_other.
+    eSym->st_name = ent.strTabOffset;
     eSym->setBindingAndType(sym->binding, sym->type);
-    eSym->setVisibility(sym->visibility);
+    eSym->st_other = sym->visibility;
 
     // The 3 most significant bits of st_other are used by OpenPOWER ABI.
     // See getPPC64GlobalEntryToLocalEntryOffset() for more details.
@@ -2219,30 +2218,29 @@
     else if (config->emachine == EM_AARCH64)
       eSym->st_other |= sym->stOther & STO_AARCH64_VARIANT_PCS;
 
-    eSym->st_name = ent.strTabOffset;
-    if (isDefinedHere)
-      eSym->st_shndx = getSymSectionIndex(ent.sym);
-    else
-      eSym->st_shndx = 0;
-
-    // Copy symbol size if it is a defined symbol. st_size is not significant
-    // for undefined symbols, so whether copying it or not is up to us if that's
-    // the case. We'll leave it as zero because by not setting a value, we can
-    // get the exact same outputs for two sets of input files that differ only
-    // in undefined symbol size in DSOs.
-    if (eSym->st_shndx == SHN_UNDEF || !isDefinedHere)
-      eSym->st_size = 0;
-    else
-      eSym->st_size = sym->getSize();
-
-    // st_value is usually an address of a symbol, but that has a special
-    // meaning for uninstantiated common symbols (--no-define-common).
-    if (BssSection *commonSec = getCommonSec(ent.sym))
+    if (BssSection *commonSec = getCommonSec(sym)) {
+      // st_value is usually an address of a symbol, but that has a special
+      // meaning for uninstantiated common symbols (--no-define-common).
+      eSym->st_shndx = SHN_COMMON;
       eSym->st_value = commonSec->alignment;
-    else if (isDefinedHere)
-      eSym->st_value = sym->getVA();
-    else
-      eSym->st_value = 0;
+      eSym->st_size = cast<Defined>(sym)->size;
+    } else {
+      const uint32_t shndx = getSymSectionIndex(sym);
+      if (isDefinedHere) {
+        eSym->st_shndx = shndx;
+        eSym->st_value = sym->getVA();
+        // Copy symbol size if it is a defined symbol. st_size is not
+        // significant for undefined symbols, so whether copying it or not is up
+        // to us if that's the case. We'll leave it as zero because by not
+        // setting a value, we can get the exact same outputs for two sets of
+        // input files that differ only in undefined symbol size in DSOs.
+        eSym->st_size = shndx != SHN_UNDEF ? cast<Defined>(sym)->size : 0;
+      } else {
+        eSym->st_shndx = 0;
+        eSym->st_value = 0;
+        eSym->st_size = 0;
+      }
+    }
 
     ++eSym;
   }
@@ -2293,7 +2291,7 @@
   // we need to write actual index, otherwise, we must write SHN_UNDEF(0).
   buf += 4; // Ignore .symtab[0] entry.
   for (const SymbolTableEntry &entry : in.symTab->getSymbols()) {
-    if (getSymSectionIndex(entry.sym) == SHN_XINDEX)
+    if (!getCommonSec(entry.sym) && getSymSectionIndex(entry.sym) == SHN_XINDEX)
       write32(buf, entry.sym->getOutputSection()->sectionIndex);
     buf += 4;
   }
@@ -2476,7 +2474,7 @@
 }
 
 void HashTableSection::finalizeContents() {
-  SymbolTableBaseSection *symTab = getPartition().dynSymTab;
+  SymbolTableBaseSection *symTab = getPartition().dynSymTab.get();
 
   if (OutputSection *sec = symTab->getParent())
     getParent()->link = sec->sectionIndex;
@@ -2490,7 +2488,7 @@
 }
 
 void HashTableSection::writeTo(uint8_t *buf) {
-  SymbolTableBaseSection *symTab = getPartition().dynSymTab;
+  SymbolTableBaseSection *symTab = getPartition().dynSymTab.get();
 
   // See comment in GnuHashTableSection::writeTo.
   memset(buf, 0, size);
@@ -3794,8 +3792,9 @@
     write32(buf, mainPart->dynStrTab->getVA() + partitions[i].nameStrTab - va);
     write32(buf + 4, partitions[i].elfHeader->getVA() - (va + 4));
 
-    SyntheticSection *next =
-        i == partitions.size() - 1 ? in.partEnd : partitions[i + 1].elfHeader;
+    SyntheticSection *next = i == partitions.size() - 1
+                                 ? in.partEnd.get()
+                                 : partitions[i + 1].elfHeader.get();
     write32(buf + 8, next->getVA() - partitions[i].elfHeader->getVA());
 
     va += 12;
@@ -3803,6 +3802,30 @@
   }
 }
 
+void InStruct::reset() {
+  attributes.reset();
+  bss.reset();
+  bssRelRo.reset();
+  got.reset();
+  gotPlt.reset();
+  igotPlt.reset();
+  ppc64LongBranchTarget.reset();
+  mipsGot.reset();
+  mipsRldMap.reset();
+  partEnd.reset();
+  partIndex.reset();
+  plt.reset();
+  iplt.reset();
+  ppc32Got2.reset();
+  ibtPlt.reset();
+  relaPlt.reset();
+  relaIplt.reset();
+  shStrTab.reset();
+  strTab.reset();
+  symTab.reset();
+  symTabShndx.reset();
+}
+
 InStruct elf::in;
 
 std::vector<Partition> elf::partitions;
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -299,18 +299,18 @@
 
   auto add = [](SyntheticSection &sec) { inputSections.push_back(&sec); };
 
-  in.shStrTab = make<StringTableSection>(".shstrtab", false);
+  in.shStrTab = std::make_unique<StringTableSection>(".shstrtab", false);
 
   Out::programHeaders = make<OutputSection>("", 0, SHF_ALLOC);
   Out::programHeaders->alignment = config->wordsize;
 
   if (config->strip != StripPolicy::All) {
-    in.strTab = make<StringTableSection>(".strtab", false);
-    in.symTab = make<SymbolTableSection<ELFT>>(*in.strTab);
-    in.symTabShndx = make<SymtabShndxSection>();
+    in.strTab = std::make_unique<StringTableSection>(".strtab", false);
+    in.symTab = std::make_unique<SymbolTableSection<ELFT>>(*in.strTab);
+    in.symTabShndx = std::make_unique<SymtabShndxSection>();
   }
 
-  in.bss = make<BssSection>(".bss", 0, 1);
+  in.bss = std::make_unique<BssSection>(".bss", 0, 1);
   add(*in.bss);
 
   // If there is a SECTIONS command and a .data.rel.ro section name use name
@@ -318,14 +318,14 @@
   // This makes sure our relro is contiguous.
   bool hasDataRelRo =
       script->hasSectionsCommand && findSection(".data.rel.ro", 0);
-  in.bssRelRo =
-      make<BssSection>(hasDataRelRo ? ".data.rel.ro.bss" : ".bss.rel.ro", 0, 1);
+  in.bssRelRo = std::make_unique<BssSection>(
+      hasDataRelRo ? ".data.rel.ro.bss" : ".bss.rel.ro", 0, 1);
   add(*in.bssRelRo);
 
   // Add MIPS-specific sections.
   if (config->emachine == EM_MIPS) {
     if (!config->shared && config->hasDynSymTab) {
-      in.mipsRldMap = make<MipsRldMapSection>();
+      in.mipsRldMap = std::make_unique<MipsRldMapSection>();
       add(*in.mipsRldMap);
     }
     if (auto *sec = MipsAbiFlagsSection<ELFT>::create())
@@ -345,40 +345,43 @@
   };
 
   if (!part.name.empty()) {
-    part.elfHeader = make<PartitionElfHeaderSection<ELFT>>();
+    part.elfHeader = std::make_unique<PartitionElfHeaderSection<ELFT>>();
     part.elfHeader->name = part.name;
     add(*part.elfHeader);
 
-    part.programHeaders = make<PartitionProgramHeadersSection<ELFT>>();
+    part.programHeaders =
+        std::make_unique<PartitionProgramHeadersSection<ELFT>>();
     add(*part.programHeaders);
   }
 
   if (config->buildId != BuildIdKind::None) {
-    part.buildId = make<BuildIdSection>();
+    part.buildId = std::make_unique<BuildIdSection>();
     add(*part.buildId);
   }
 
-  part.dynStrTab = make<StringTableSection>(".dynstr", true);
-  part.dynSymTab = make<SymbolTableSection<ELFT>>(*part.dynStrTab);
-  part.dynamic = make<DynamicSection<ELFT>>();
+  part.dynStrTab = std::make_unique<StringTableSection>(".dynstr", true);
+  part.dynSymTab =
+      std::make_unique<SymbolTableSection<ELFT>>(*part.dynStrTab);
+  part.dynamic = std::make_unique<DynamicSection<ELFT>>();
   if (config->androidPackDynRelocs)
-    part.relaDyn = make<AndroidPackedRelocationSection<ELFT>>(relaDynName);
-  else
     part.relaDyn =
-        make<RelocationSection<ELFT>>(relaDynName, config->zCombreloc);
+        std::make_unique<AndroidPackedRelocationSection<ELFT>>(relaDynName);
+  else
+    part.relaDyn = std::make_unique<RelocationSection<ELFT>>(
+        relaDynName, config->zCombreloc);
 
   if (config->hasDynSymTab) {
     add(*part.dynSymTab);
 
-    part.verSym = make<VersionTableSection>();
+    part.verSym = std::make_unique<VersionTableSection>();
     add(*part.verSym);
 
     if (!namedVersionDefs().empty()) {
-      part.verDef = make<VersionDefinitionSection>();
+      part.verDef = std::make_unique<VersionDefinitionSection>();
       add(*part.verDef);
     }
 
-    part.verNeed = make<VersionNeedSection<ELFT>>();
+    part.verNeed = std::make_unique<VersionNeedSection<ELFT>>();
     add(*part.verNeed);
 
     if (config->gnuHash) {
@@ -397,23 +400,23 @@
   }
 
   if (config->relrPackDynRelocs) {
-    part.relrDyn = make<RelrSection<ELFT>>();
+    part.relrDyn = std::make_unique<RelrSection<ELFT>>();
     add(*part.relrDyn);
   }
 
   if (!config->relocatable) {
     if (config->ehFrameHdr) {
-      part.ehFrameHdr = make<EhFrameHeader>();
+      part.ehFrameHdr = std::make_unique<EhFrameHeader>();
       add(*part.ehFrameHdr);
     }
-    part.ehFrame = make<EhFrameSection>();
+    part.ehFrame = std::make_unique<EhFrameSection>();
     add(*part.ehFrame);
   }
 
   if (config->emachine == EM_ARM && !config->relocatable) {
     // The ARMExidxsyntheticsection replaces all the individual .ARM.exidx
     // InputSections.
-    part.armExidx = make<ARMExidxSyntheticSection>();
+    part.armExidx = std::make_unique<ARMExidxSyntheticSection>();
     add(*part.armExidx);
   }
 }
@@ -422,13 +425,14 @@
   // Create the partition end marker. This needs to be in partition number 255
   // so that it is sorted after all other partitions. It also has other
   // special handling (see createPhdrs() and combineEhSections()).
-  in.partEnd = make<BssSection>(".part.end", config->maxPageSize, 1);
+  in.partEnd =
+      std::make_unique<BssSection>(".part.end", config->maxPageSize, 1);
   in.partEnd->partition = 255;
   add(*in.partEnd);
 
-  in.partIndex = make<PartitionIndexSection>();
-  addOptionalRegular("__part_index_begin", in.partIndex, 0);
-  addOptionalRegular("__part_index_end", in.partIndex,
+  in.partIndex = std::make_unique<PartitionIndexSection>();
+  addOptionalRegular("__part_index_begin", in.partIndex.get(), 0);
+  addOptionalRegular("__part_index_end", in.partIndex.get(),
                      in.partIndex->getSize());
   add(*in.partIndex);
 }
@@ -436,26 +440,26 @@
   // Add .got. MIPS' .got is so different from the other archs,
   // it has its own class.
   if (config->emachine == EM_MIPS) {
-    in.mipsGot = make<MipsGotSection>();
+    in.mipsGot = std::make_unique<MipsGotSection>();
     add(*in.mipsGot);
   } else {
-    in.got = make<GotSection>();
+    in.got = std::make_unique<GotSection>();
     add(*in.got);
   }
 
   if (config->emachine == EM_PPC) {
-    in.ppc32Got2 = make<PPC32Got2Section>();
+    in.ppc32Got2 = std::make_unique<PPC32Got2Section>();
     add(*in.ppc32Got2);
   }
 
   if (config->emachine == EM_PPC64) {
-    in.ppc64LongBranchTarget = make<PPC64LongBranchTargetSection>();
+    in.ppc64LongBranchTarget = std::make_unique<PPC64LongBranchTargetSection>();
     add(*in.ppc64LongBranchTarget);
   }
 
-  in.gotPlt = make<GotPltSection>();
+  in.gotPlt = std::make_unique<GotPltSection>();
   add(*in.gotPlt);
-  in.igotPlt = make<IgotPltSection>();
+  in.igotPlt = std::make_unique<IgotPltSection>();
   add(*in.igotPlt);
 
   // _GLOBAL_OFFSET_TABLE_ is defined relative to either .got.plt or .got. Treat
@@ -472,7 +476,7 @@
 
   // We always need to add rel[a].plt to output if it has entries.
   // Even for static linking it can contain R_[*]_IRELATIVE relocations.
-  in.relaPlt = make<RelocationSection<ELFT>>(
+  in.relaPlt = std::make_unique<RelocationSection<ELFT>>(
       config->isRela ? ".rela.plt" : ".rel.plt", /*sort=*/false);
   add(*in.relaPlt);
 
@@ -482,21 +486,23 @@
   // that would cause a section type mismatch. However, because the Android
   // dynamic loader reads .rel.plt after .rel.dyn, we can get the desired
   // behaviour by placing the iplt section in .rel.plt.
-  in.relaIplt = make<RelocationSection<ELFT>>(
+  in.relaIplt = std::make_unique<RelocationSection<ELFT>>(
      config->androidPackDynRelocs ? in.relaPlt->name : relaDynName,
      /*sort=*/false);
   add(*in.relaIplt);
 
   if ((config->emachine == EM_386 || config->emachine == EM_X86_64) &&
       (config->andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT)) {
-    in.ibtPlt = make<IBTPltSection>();
+    in.ibtPlt = std::make_unique<IBTPltSection>();
     add(*in.ibtPlt);
   }
 
-  in.plt = config->emachine == EM_PPC ? make<PPC32GlinkSection>()
-                                      : make<PltSection>();
+  if (config->emachine == EM_PPC)
+    in.plt = std::make_unique<PPC32GlinkSection>();
+  else
+    in.plt = std::make_unique<PltSection>();
   add(*in.plt);
-  in.iplt = make<IpltSection>();
+  in.iplt = std::make_unique<IpltSection>();
   add(*in.iplt);
 
   if (config->andFeatures)
@@ -557,9 +563,6 @@
   for (Partition &part : partitions)
     setPhdrs(part);
 
-  if (config->relocatable)
-    for (OutputSection *sec : outputSections)
-      sec->addr = 0;
 
   // Handle --print-map(-M)/--Map, --why-extract=, --cref and
   // --print-archive-stats=. Dump them before checkSections() because the files
@@ -1059,17 +1062,17 @@
   if (ElfSym::globalOffsetTable) {
     // The _GLOBAL_OFFSET_TABLE_ symbol is defined by target convention usually
     // to the start of the .got or .got.plt section.
-    InputSection *gotSection = in.gotPlt;
+    InputSection *sec = in.gotPlt.get();
     if (!target->gotBaseSymInGotPlt)
-      gotSection = in.mipsGot ? cast<InputSection>(in.mipsGot)
-                              : cast<InputSection>(in.got);
-    ElfSym::globalOffsetTable->section = gotSection;
+      sec = in.mipsGot.get() ? cast<InputSection>(in.mipsGot.get())
                             : cast<InputSection>(in.got.get());
+    ElfSym::globalOffsetTable->section = sec;
   }
 
   // .rela_iplt_{start,end} mark the start and the end of in.relaIplt.
   if (ElfSym::relaIpltStart && in.relaIplt->isNeeded()) {
-    ElfSym::relaIpltStart->section = in.relaIplt;
-    ElfSym::relaIpltEnd->section = in.relaIplt;
+    ElfSym::relaIpltStart->section = in.relaIplt.get();
+    ElfSym::relaIpltEnd->section = in.relaIplt.get();
     ElfSym::relaIpltEnd->value = in.relaIplt->getSize();
   }
 
@@ -1646,7 +1649,7 @@
   // can assign Virtual Addresses to OutputSections that are not monotonically
   // increasing.
   for (Partition &part : partitions)
-    finalizeSynthetic(part.armExidx);
+    finalizeSynthetic(part.armExidx.get());
   resolveShfLinkOrder();
 
   // Converts call x@GDPLT to call __tls_get_addr
@@ -1699,6 +1702,10 @@
     }
   }
 
+  if (config->relocatable)
+    for (OutputSection *sec : outputSections)
+      sec->addr = 0;
+
   // If addrExpr is set, the address may not be a multiple of the alignment.
   // Warn because this is error-prone.
   for (SectionCommand *cmd : script->sectionCommands)
@@ -1774,12 +1781,11 @@
     // Delete all fall through jump instructions.  Also, check if two
     // consecutive jump instructions can be flipped so that a fall
     // through jmp instruction can be deleted.
-    parallelForEachN(0, sections.size(), [&](size_t i) {
+    for (size_t i = 0, e = sections.size(); i != e; ++i) {
       InputSection *next = i + 1 < sections.size() ? sections[i + 1] : nullptr;
-      InputSection &is = *sections[i];
-      result[i] =
-          target->deleteFallThruJmpInsn(is, is.getFile<ELFT>(), next) ? 1 : 0;
-    });
+      InputSection &sec = *sections[i];
+      result[i] = target->deleteFallThruJmpInsn(sec, sec.file, next) ? 1 : 0;
+    }
     size_t numDeleted = std::count(result.begin(), result.end(), 1);
     if (numDeleted > 0) {
       script->assignAddresses();
@@ -1864,9 +1870,9 @@
   // Even the author of gold doesn't remember why gold behaves that way.
   // https://sourceware.org/ml/binutils/2002-03/msg00360.html
   if (mainPart->dynamic->parent)
-    symtab->addSymbol(Defined{/*file=*/nullptr, "_DYNAMIC", STB_WEAK,
-                              STV_HIDDEN, STT_NOTYPE,
-                              /*value=*/0, /*size=*/0, mainPart->dynamic});
+    symtab->addSymbol(
+        Defined{/*file=*/nullptr, "_DYNAMIC", STB_WEAK, STV_HIDDEN, STT_NOTYPE,
+                /*value=*/0, /*size=*/0, mainPart->dynamic.get()});
 
   // Define __rel[a]_iplt_{start,end} symbols if needed.
   addRelIpltSymbols();
@@ -1909,7 +1915,7 @@
     // pieces. The relocation scan uses those pieces, so this has to be
     // earlier.
     for (Partition &part : partitions)
-      finalizeSynthetic(part.ehFrame);
+      finalizeSynthetic(part.ehFrame.get());
   }
 
   for (Symbol *sym : symtab->symbols())
@@ -2075,35 +2081,35 @@
   {
     llvm::TimeTraceScope timeScope("Finalize synthetic sections");
 
-    finalizeSynthetic(in.bss);
-    finalizeSynthetic(in.bssRelRo);
-    finalizeSynthetic(in.symTabShndx);
-    finalizeSynthetic(in.shStrTab);
-    finalizeSynthetic(in.strTab);
-    finalizeSynthetic(in.got);
-    finalizeSynthetic(in.mipsGot);
-    finalizeSynthetic(in.igotPlt);
-    finalizeSynthetic(in.gotPlt);
-    finalizeSynthetic(in.relaIplt);
-    finalizeSynthetic(in.relaPlt);
-    finalizeSynthetic(in.plt);
-    finalizeSynthetic(in.iplt);
-    finalizeSynthetic(in.ppc32Got2);
-    finalizeSynthetic(in.partIndex);
+    finalizeSynthetic(in.bss.get());
+    finalizeSynthetic(in.bssRelRo.get());
+    finalizeSynthetic(in.symTabShndx.get());
+    finalizeSynthetic(in.shStrTab.get());
+    finalizeSynthetic(in.strTab.get());
+    finalizeSynthetic(in.got.get());
+    finalizeSynthetic(in.mipsGot.get());
+    finalizeSynthetic(in.igotPlt.get());
+    finalizeSynthetic(in.gotPlt.get());
+    finalizeSynthetic(in.relaIplt.get());
+    finalizeSynthetic(in.relaPlt.get());
+    finalizeSynthetic(in.plt.get());
+    finalizeSynthetic(in.iplt.get());
+    finalizeSynthetic(in.ppc32Got2.get());
+    finalizeSynthetic(in.partIndex.get());
 
     // Dynamic section must be the last one in this list and dynamic
     // symbol table section (dynSymTab) must be the first one.
     for (Partition &part : partitions) {
-      finalizeSynthetic(part.dynSymTab);
+      finalizeSynthetic(part.dynSymTab.get());
       finalizeSynthetic(part.gnuHashTab);
       finalizeSynthetic(part.hashTab);
-      finalizeSynthetic(part.verDef);
-      finalizeSynthetic(part.relaDyn);
-      finalizeSynthetic(part.relrDyn);
-      finalizeSynthetic(part.ehFrameHdr);
-      finalizeSynthetic(part.verSym);
-      finalizeSynthetic(part.verNeed);
-      finalizeSynthetic(part.dynamic);
+      finalizeSynthetic(part.verDef.get());
+      finalizeSynthetic(part.relaDyn.get());
+      finalizeSynthetic(part.relrDyn.get());
+      finalizeSynthetic(part.ehFrameHdr.get());
+      finalizeSynthetic(part.verSym.get());
+      finalizeSynthetic(part.verNeed.get());
+      finalizeSynthetic(part.dynamic.get());
     }
   }
 
@@ -2139,8 +2145,8 @@
     llvm::TimeTraceScope timeScope("Finalize synthetic sections");
     // finalizeAddressDependentContent may have added local symbols to the
    // static symbol table.
-    finalizeSynthetic(in.symTab);
-    finalizeSynthetic(in.ppc64LongBranchTarget);
+    finalizeSynthetic(in.symTab.get());
+    finalizeSynthetic(in.ppc64LongBranchTarget.get());
   }
 
   // Relaxation to delete inter-basic block jumps created by basic block
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -908,13 +908,28 @@
   uint32_t sectionIndex = 0;
   for (OutputSegment *seg : outputSegments) {
     seg->sortOutputSections();
+    // References from thread-local variable sections are treated as offsets
+    // relative to the start of the thread-local data memory area, which
+    // is initialized via copying all the TLV data sections (which are all
+    // contiguous). If later data sections require a greater alignment than
+    // earlier ones, the offsets of data within those sections won't be
+    // guaranteed to be aligned unless we normalize alignments. We therefore
+    // use the largest alignment for all TLV data sections.
+    uint32_t tlvAlign = 0;
+    for (const OutputSection *osec : seg->getSections())
+      if (isThreadLocalData(osec->flags) && osec->align > tlvAlign)
+        tlvAlign = osec->align;
+
     for (OutputSection *osec : seg->getSections()) {
       // Now that the output sections are sorted, assign the final
       // output section indices.
       if (!osec->isHidden())
         osec->index = ++sectionIndex;
-      if (!firstTLVDataSection && isThreadLocalData(osec->flags))
-        firstTLVDataSection = osec;
+      if (isThreadLocalData(osec->flags)) {
+        if (!firstTLVDataSection)
+          firstTLVDataSection = osec;
+        osec->align = tlvAlign;
+      }
 
       if (!isecPriorities.empty()) {
         if (auto *merged = dyn_cast<ConcatOutputSection>(osec)) {
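The invariant this loop enforces: a TLV's offset is computed against the start of the whole thread-local image, so every TLV data section must be laid out with one common alignment or a later section's contents shift when an earlier one changes size. A toy layout computation showing the effect (numbers are illustrative, not from the patch):

#include <cstdint>
#include <iostream>

// Round x up to a multiple of a (a must be a power of two).
static uint64_t alignTo(uint64_t x, uint64_t a) { return (x + a - 1) & ~(a - 1); }

int main() {
  // Two TLV data sections with mismatched alignments: where the second one
  // starts (and thus every offset inside it) depends on the first one's size
  // and padding. Normalizing both to the maximum alignment (16 here) makes
  // the image layout insensitive to such shifts.
  const uint64_t sizes[] = {24, 16};
  const uint64_t aligns[] = {8, 16};
  uint64_t off = 0;
  for (int i = 0; i < 2; i++) {
    off = alignTo(off, aligns[i]);
    std::cout << "section " << i << " starts at image offset " << off << "\n";
    off += sizes[i];
  }
  return 0;
}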
diff --git a/lld/test/ELF/gc-sections.s b/lld/test/ELF/gc-sections.s
--- a/lld/test/ELF/gc-sections.s
+++ b/lld/test/ELF/gc-sections.s
@@ -157,6 +157,10 @@
 .section .init_array,"aw",@progbits
   .quad 0
 
+# Work around https://github.com/rust-lang/rust/issues/92181
+.section .init_array.00001,"aw",@progbits
+  .quad 0
+
 .section .preinit_array,"aw",@preinit_array
   .quad 0
 
diff --git a/lld/test/MachO/tlv.s b/lld/test/MachO/tlv.s
--- a/lld/test/MachO/tlv.s
+++ b/lld/test/MachO/tlv.s
@@ -24,12 +24,12 @@
 # RUN: llvm-objdump -d --bind --rebase %t/regular-and-tbss | FileCheck %s --check-prefixes=REG,TBSS,LINKEDIT
 # RUN: llvm-objdump --macho --section=__DATA,__thread_vars %t/regular-and-tbss | \
 # RUN:   FileCheck %s --check-prefix=REG-TBSS-TLVP
-# RUN: llvm-objdump --section-headers %t/regular-and-tbss | FileCheck %s --check-prefix=SECTION-ORDER
+# RUN: llvm-objdump --section-headers %t/regular-and-tbss | FileCheck %s --check-prefix=SECTIONS
 
 ## Check that we always put __thread_bss immediately after __thread_data,
 ## regardless of the order of the input files.
 # RUN: %lld -lSystem %t/tbss.o %t/regular.o -o %t/regular-and-tbss
-# RUN: llvm-objdump --section-headers %t/regular-and-tbss | FileCheck %s --check-prefix=SECTION-ORDER
+# RUN: llvm-objdump --section-headers %t/regular-and-tbss | FileCheck %s --check-prefix=SECTIONS
 
 # HEADER: MH_HAS_TLV_DESCRIPTORS
 
@@ -41,6 +41,7 @@
 # TBSS: <_f>:
 # TBSS-NEXT: leaq {{.*}}(%rip), %rax ## {{.*}} <_baz>
 # TBSS-NEXT: leaq {{.*}}(%rip), %rax ## {{.*}} <_qux>
+# TBSS-NEXT: leaq {{.*}}(%rip), %rax ## {{.*}} <_hoge>
 # TBSS-NEXT: retq
 
 # REG-TLVP: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
@@ -53,10 +54,12 @@
 
 # REG-TBSS-TLVP: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 # REG-TBSS-TLVP-NEXT: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
-# REG-TBSS-TLVP-NEXT: 00 00 00 00 00 00 00 00 08 00 00 00 00 00 00 00
+# REG-TBSS-TLVP-NEXT: 00 00 00 00 00 00 00 00 10 00 00 00 00 00 00 00
 # REG-TBSS-TLVP-NEXT: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
-# REG-TBSS-TLVP-NEXT: 10 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
-# REG-TBSS-TLVP-NEXT: 00 00 00 00 00 00 00 00 18 00 00 00 00 00 00 00
+# REG-TBSS-TLVP-NEXT: 20 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+# REG-TBSS-TLVP-NEXT: 00 00 00 00 00 00 00 00 28 00 00 00 00 00 00 00
+# REG-TBSS-TLVP-NEXT: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+# REG-TBSS-TLVP-NEXT: 30 00 00 00 00 00 00 00
 
 ## Make sure we don't emit rebase opcodes for relocations in __thread_vars.
 # LINKEDIT:      Rebase table:
@@ -66,9 +69,14 @@
 # LINKEDIT: __DATA __thread_vars 0x{{[0-9a-f]*}} pointer 0 libSystem __tlv_bootstrap
 # LINKEDIT: __DATA __thread_vars 0x{{[0-9a-f]*}} pointer 0 libSystem __tlv_bootstrap
 
-# SECTION-ORDER: __thread_data
-# SECTION-ORDER: more_thread_data
-# SECTION-ORDER-NEXT: __thread_bss
+## Make sure we have an odd number of tlv vars, and that the __thread_vars
+## section starts 16-bytes aligned. This is the setup required for __thread_data
+## not to be automatically 16-bytes aligned, ensuring the linker does its
+## expected job of aligning _hoge$tlv$init.
+# SECTIONS: __thread_vars {{[0-9]+}}8 {{[0-9]+}}0
+# SECTIONS: __thread_data
+# SECTIONS: more_thread_data
+# SECTIONS-NEXT: __thread_bss
 
 #--- regular.s
 .globl _main
@@ -102,10 +110,12 @@
 _f:
   mov _baz@TLVP(%rip), %rax
   mov _qux@TLVP(%rip), %rax
+  mov _hoge@TLVP(%rip), %rax
   ret
 
 .tbss _baz$tlv$init, 8, 3
 .tbss _qux$tlv$init, 8, 3
+.tbss _hoge$tlv$init, 16, 4
 
 .section __DATA,__thread_vars,thread_local_variables
 _baz:
@@ -116,3 +126,7 @@
   .quad __tlv_bootstrap
   .quad 0
   .quad _qux$tlv$init
+_hoge:
+  .quad __tlv_bootstrap
+  .quad 0
+  .quad _hoge$tlv$init
diff --git a/lldb/include/lldb/Core/Mangled.h b/lldb/include/lldb/Core/Mangled.h
--- a/lldb/include/lldb/Core/Mangled.h
+++ b/lldb/include/lldb/Core/Mangled.h
@@ -72,10 +72,10 @@
     return !(*this == rhs);
   }
 
-  /// Convert to pointer operator.
+  /// Convert to bool operator.
   ///
-  /// This allows code to check a Mangled object to see if it contains a valid
-  /// mangled name using code such as:
+  /// This allows code to check any Mangled objects to see if they contain
+  /// anything valid using code such as:
   ///
   /// \code
   ///   Mangled mangled(...);
   ///   if (mangled)
   ///   { ...
   /// \endcode
   ///
   /// \return
@@ -84,25 +84,9 @@
-  ///   A pointer to this object if either the mangled or unmangled
-  ///   name is set, NULL otherwise.
-  operator void *() const;
-
-  /// Logical NOT operator.
-  ///
-  /// This allows code to check a Mangled object to see if it contains an
-  /// empty mangled name using code such as:
-  ///
-  /// \code
-  ///   Mangled mangled(...);
-  ///   if (!mangled)
-  ///   { ...
-  /// \endcode
-  ///
-  /// \return
-  ///   Returns \b true if the object has an empty mangled and
-  ///   unmangled name, \b false otherwise.
-  bool operator!() const;
+  ///   Returns \b true if either the mangled or unmangled name is set,
+  ///   \b false if the object has an empty mangled and unmangled name.
+  explicit operator bool() const;
 
   /// Clear the mangled and demangled values.
   void Clear();
diff --git a/lldb/include/lldb/Symbol/SymbolFile.h b/lldb/include/lldb/Symbol/SymbolFile.h
--- a/lldb/include/lldb/Symbol/SymbolFile.h
+++ b/lldb/include/lldb/Symbol/SymbolFile.h
@@ -67,8 +67,7 @@
   // Constructors and Destructors
   SymbolFile(lldb::ObjectFileSP objfile_sp)
-      : m_objfile_sp(std::move(objfile_sp)), m_abilities(0),
-        m_calculated_abilities(false) {}
+      : m_objfile_sp(std::move(objfile_sp)) {}
 
   ~SymbolFile() override = default;
 
@@ -326,6 +325,29 @@
   /// hasn't been indexed yet, or a valid duration if it has.
   virtual StatsDuration GetDebugInfoIndexTime() { return StatsDuration(0.0); }
 
+  /// Accessors for the bool that indicates if the debug info index was loaded
+  /// from, or saved to, the module index cache.
+  ///
+  /// In statistics it is handy to know if a module's debug info index was
+  /// loaded from or saved to the cache. When the debug info index is loaded
+  /// from the cache, startup times can be faster. When the cache is enabled
+  /// and the debug info index is saved to the cache, debug sessions can be
+  /// slower. These accessors can be queried by the statistics code and
+  /// emitted to help track these costs.
+ /// \{ + bool GetDebugInfoIndexWasLoadedFromCache() const { + return m_index_was_loaded_from_cache; + } + void SetDebugInfoIndexWasLoadedFromCache() { + m_index_was_loaded_from_cache = true; + } + bool GetDebugInfoIndexWasSavedToCache() const { + return m_index_was_saved_to_cache; + } + void SetDebugInfoIndexWasSavedToCache() { + m_index_was_saved_to_cache = true; + } + /// \} + protected: void AssertModuleLock(); virtual uint32_t CalculateNumCompileUnits() = 0; @@ -341,8 +363,10 @@ llvm::Optional> m_compile_units; TypeList m_type_list; Symtab *m_symtab = nullptr; - uint32_t m_abilities; - bool m_calculated_abilities; + uint32_t m_abilities = 0; + bool m_calculated_abilities = false; + bool m_index_was_loaded_from_cache = false; + bool m_index_was_saved_to_cache = false; private: SymbolFile(const SymbolFile &) = delete; diff --git a/lldb/include/lldb/Symbol/Symtab.h b/lldb/include/lldb/Symbol/Symtab.h --- a/lldb/include/lldb/Symbol/Symtab.h +++ b/lldb/include/lldb/Symbol/Symtab.h @@ -212,6 +212,30 @@ /// false if the symbol table wasn't cached or was out of date. bool LoadFromCache(); + + /// Accessors for the flags that indicate whether the symbol table was + /// loaded from, or saved to, the module index cache. + /// + /// In statistics it is handy to know if a module's symbol table was loaded + /// from or saved to the cache. When the symbol table is loaded from the + /// cache, startup times can be faster. When the cache is enabled and the + /// symbol table is saved to the cache, debug sessions can be slower. The + /// statistics code reads these flags and emits them to help track these + /// costs. + /// \{ + bool GetWasLoadedFromCache() const { + return m_loaded_from_cache; + } + void SetWasLoadedFromCache() { + m_loaded_from_cache = true; + } + bool GetWasSavedToCache() const { + return m_saved_to_cache; + } + void SetWasSavedToCache() { + m_saved_to_cache = true; + } + /// \} + protected: typedef std::vector collection; typedef collection::iterator iterator; @@ -252,7 +276,8 @@ m_name_to_symbol_indices; mutable std::recursive_mutex m_mutex; // Provide thread safety for this symbol table - bool m_file_addr_to_index_computed : 1, m_name_indexes_computed : 1; + bool m_file_addr_to_index_computed : 1, m_name_indexes_computed : 1, + m_loaded_from_cache : 1, m_saved_to_cache : 1; private: UniqueCStringMap & diff --git a/lldb/include/lldb/Target/Statistics.h b/lldb/include/lldb/Target/Statistics.h --- a/lldb/include/lldb/Target/Statistics.h +++ b/lldb/include/lldb/Target/Statistics.h @@ -84,6 +84,10 @@ double debug_parse_time = 0.0; double debug_index_time = 0.0; uint64_t debug_info_size = 0; + bool symtab_loaded_from_cache = false; + bool symtab_saved_to_cache = false; + bool debug_info_index_loaded_from_cache = false; + bool debug_info_index_saved_to_cache = false; }; /// A class that represents statistics for a single lldb_private::Target.
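For context, the four ModuleStats booleans added above surface in the per-module JSON emitted by "statistics dump" (see the Statistics.cpp hunk later in this patch). Below is a minimal standalone sketch of that emission pattern using llvm::json; the JSON key names match this patch, while the ModuleCacheStats struct and the driver are illustrative only and not part of the patch:

#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"

// Illustrative stand-in for the cache-related ModuleStats fields; only the
// four booleans added by this patch are modeled here.
struct ModuleCacheStats {
  bool symtab_loaded_from_cache = false;
  bool symtab_saved_to_cache = true;
  bool debug_info_index_loaded_from_cache = false;
  bool debug_info_index_saved_to_cache = true;

  llvm::json::Object ToJSON() const {
    llvm::json::Object module;
    // Key names match the ones added to Statistics.cpp in this patch.
    module.try_emplace("symbolTableLoadedFromCache", symtab_loaded_from_cache);
    module.try_emplace("symbolTableSavedToCache", symtab_saved_to_cache);
    module.try_emplace("debugInfoIndexLoadedFromCache",
                       debug_info_index_loaded_from_cache);
    module.try_emplace("debugInfoIndexSavedToCache",
                       debug_info_index_saved_to_cache);
    return module;
  }
};

int main() {
  // Pretty-print the object with a two-space indent.
  llvm::json::Value val(ModuleCacheStats().ToJSON());
  llvm::outs() << llvm::formatv("{0:2}", val) << "\n";
  return 0;
}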
diff --git a/lldb/include/lldb/Utility/Log.h b/lldb/include/lldb/Utility/Log.h --- a/lldb/include/lldb/Utility/Log.h +++ b/lldb/include/lldb/Utility/Log.h @@ -211,8 +211,6 @@ static uint32_t GetFlags(llvm::raw_ostream &stream, const ChannelMap::value_type &entry, llvm::ArrayRef categories); - static void DisableLoggingChild(); - Log(const Log &) = delete; void operator=(const Log &) = delete; }; diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td --- a/lldb/source/Commands/Options.td +++ b/lldb/source/Commands/Options.td @@ -18,8 +18,7 @@ } let Command = "settings set" in { - def setset_global : Option<"global", "g">, Arg<"Filename">, - Completion<"DiskFile">, + def setset_global : Option<"global", "g">, Desc<"Apply the new value to the global default value.">; def setset_force : Option<"force", "f">, Desc<"Force an empty value to be accepted as the default.">; diff --git a/lldb/source/Core/Mangled.cpp b/lldb/source/Core/Mangled.cpp --- a/lldb/source/Core/Mangled.cpp +++ b/lldb/source/Core/Mangled.cpp @@ -70,23 +70,13 @@ SetValue(ConstString(name)); } -// Convert to pointer operator. This allows code to check any Mangled objects +// Convert to bool operator. This allows code to check any Mangled objects // to see if they contain anything valid using code such as: // // Mangled mangled(...); // if (mangled) // { ... -Mangled::operator void *() const { - return (m_mangled) ? const_cast(this) : nullptr; -} - -// Logical NOT operator. This allows code to check any Mangled objects to see -// if they are invalid using code such as: -// -// Mangled mangled(...); -// if (!file_spec) -// { ... -bool Mangled::operator!() const { return !m_mangled; } +Mangled::operator bool() const { return m_mangled || m_demangled; } // Clear the mangled and demangled values. void Mangled::Clear() { diff --git a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp --- a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp +++ b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp @@ -38,43 +38,40 @@ using namespace lldb; using namespace lldb_private; -static void FixupEnvironment(Environment &env) { -#ifdef __ANDROID__ - // If there is no PATH variable specified inside the environment then set the - // path to /system/bin. It is required because the default path used by - // execve() is wrong on android. - env.try_emplace("PATH", "/system/bin"); -#endif +// Begin code running in the child process +// NB: This code needs to be async-signal safe, since we're invoking fork from +// multithreaded contexts. + +static void write_string(int error_fd, const char *str) { + int r = write(error_fd, str, strlen(str)); + (void)r; } [[noreturn]] static void ExitWithError(int error_fd, const char *operation) { int err = errno; - llvm::raw_fd_ostream os(error_fd, true); - os << operation << " failed: " << llvm::sys::StrError(err); - os.flush(); + write_string(error_fd, operation); + write_string(error_fd, " failed: "); + // strerror is not guaranteed to be async-signal safe, but it usually is. 
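+ // (A strictly async-signal-safe alternative would be to format the errno + // value by hand; the text written here is only read back by the parent to + // build the launch error message, so best-effort output is acceptable.)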
+ write_string(error_fd, strerror(err)); _exit(1); } -static void DisableASLRIfRequested(int error_fd, const ProcessLaunchInfo &info) { +static void DisableASLR(int error_fd) { #if defined(__linux__) - if (info.GetFlags().Test(lldb::eLaunchFlagDisableASLR)) { - const unsigned long personality_get_current = 0xffffffff; - int value = personality(personality_get_current); - if (value == -1) - ExitWithError(error_fd, "personality get"); - - value = personality(ADDR_NO_RANDOMIZE | value); - if (value == -1) - ExitWithError(error_fd, "personality set"); - } + const unsigned long personality_get_current = 0xffffffff; + int value = personality(personality_get_current); + if (value == -1) + ExitWithError(error_fd, "personality get"); + + value = personality(ADDR_NO_RANDOMIZE | value); + if (value == -1) + ExitWithError(error_fd, "personality set"); #endif } -static void DupDescriptor(int error_fd, const FileSpec &file_spec, int fd, - int flags) { - int target_fd = llvm::sys::RetryAfterSignal(-1, ::open, - file_spec.GetCString(), flags, 0666); +static void DupDescriptor(int error_fd, const char *file, int fd, int flags) { + int target_fd = llvm::sys::RetryAfterSignal(-1, ::open, file, flags, 0666); if (target_fd == -1) ExitWithError(error_fd, "DupDescriptor-open"); @@ -88,44 +85,67 @@ ::close(target_fd); } -[[noreturn]] static void ChildFunc(int error_fd, - const ProcessLaunchInfo &info) { - if (info.GetFlags().Test(eLaunchFlagLaunchInSeparateProcessGroup)) { +namespace { +struct ForkFileAction { + ForkFileAction(const FileAction &act); + + FileAction::Action action; + int fd; + std::string path; + int arg; +}; + +struct ForkLaunchInfo { + ForkLaunchInfo(const ProcessLaunchInfo &info); + + bool separate_process_group; + bool debug; + bool disable_aslr; + std::string wd; + const char **argv; + Environment::Envp envp; + std::vector actions; + + bool has_action(int fd) const { + for (const ForkFileAction &action : actions) { + if (action.fd == fd) + return true; + } + return false; + } +}; +} // namespace + +[[noreturn]] static void ChildFunc(int error_fd, const ForkLaunchInfo &info) { + if (info.separate_process_group) { if (setpgid(0, 0) != 0) ExitWithError(error_fd, "setpgid"); } - for (size_t i = 0; i < info.GetNumFileActions(); ++i) { - const FileAction &action = *info.GetFileActionAtIndex(i); - switch (action.GetAction()) { + for (const ForkFileAction &action : info.actions) { + switch (action.action) { case FileAction::eFileActionClose: - if (close(action.GetFD()) != 0) + if (close(action.fd) != 0) ExitWithError(error_fd, "close"); break; case FileAction::eFileActionDuplicate: - if (dup2(action.GetFD(), action.GetActionArgument()) == -1) + if (dup2(action.fd, action.arg) == -1) ExitWithError(error_fd, "dup2"); break; case FileAction::eFileActionOpen: - DupDescriptor(error_fd, action.GetFileSpec(), action.GetFD(), - action.GetActionArgument()); + DupDescriptor(error_fd, action.path.c_str(), action.fd, action.arg); break; case FileAction::eFileActionNone: break; } } - const char **argv = info.GetArguments().GetConstArgumentVector(); - // Change working directory - if (info.GetWorkingDirectory() && - 0 != ::chdir(info.GetWorkingDirectory().GetCString())) + if (!info.wd.empty() && 0 != ::chdir(info.wd.c_str())) ExitWithError(error_fd, "chdir"); - DisableASLRIfRequested(error_fd, info); - Environment env = info.GetEnvironment(); - FixupEnvironment(env); - Environment::Envp envp = env.getEnvp(); + if (info.disable_aslr) + DisableASLR(error_fd); // Clear the signal mask to prevent the child from being 
affected by any // masking done by the parent. @@ -134,7 +154,7 @@ pthread_sigmask(SIG_SETMASK, &set, nullptr) != 0) ExitWithError(error_fd, "pthread_sigmask"); - if (info.GetFlags().Test(eLaunchFlagDebug)) { + if (info.debug) { // Do not inherit setgid powers. if (setgid(getgid()) != 0) ExitWithError(error_fd, "setgid"); @@ -143,6 +163,8 @@ // Close everything besides stdin, stdout, and stderr that has no file // action to avoid leaking. Only do this when debugging, as elsewhere we // actually rely on passing open descriptors to child processes. + // NB: This code is not async-signal safe, but we currently do not launch + // processes for debugging from within multithreaded contexts. const llvm::StringRef proc_fd_path = "/proc/self/fd"; std::error_code ec; @@ -157,7 +179,7 @@ // Don't close first three entries since they are stdin, stdout and // stderr. - if (fd > 2 && !info.GetFileActionForFD(fd) && fd != error_fd) + if (fd > 2 && !info.has_action(fd) && fd != error_fd) files_to_close.push_back(fd); } for (int file_to_close : files_to_close) @@ -166,7 +188,7 @@ // Since /proc/self/fd didn't work, trying the slow way instead. int max_fd = sysconf(_SC_OPEN_MAX); for (int fd = 3; fd < max_fd; ++fd) - if (!info.GetFileActionForFD(fd) && fd != error_fd) + if (!info.has_action(fd) && fd != error_fd) close(fd); } @@ -176,7 +198,7 @@ } // Execute. We should never return... - execve(argv[0], const_cast(argv), envp); + execve(info.argv[0], const_cast(info.argv), info.envp); #if defined(__linux__) if (errno == ETXTBSY) { @@ -189,7 +211,7 @@ // Since this state should clear up quickly, wait a while and then give it // one more go. usleep(50000); - execve(argv[0], const_cast(argv), envp); + execve(info.argv[0], const_cast(info.argv), info.envp); } #endif @@ -198,12 +220,43 @@ ExitWithError(error_fd, "execve"); } +// End of code running in the child process. + +ForkFileAction::ForkFileAction(const FileAction &act) + : action(act.GetAction()), fd(act.GetFD()), path(act.GetPath().str()), + arg(act.GetActionArgument()) {} + +static std::vector +MakeForkActions(const ProcessLaunchInfo &info) { + std::vector result; + for (size_t i = 0; i < info.GetNumFileActions(); ++i) + result.emplace_back(*info.GetFileActionAtIndex(i)); + return result; +} + +static Environment::Envp FixupEnvironment(Environment env) { +#ifdef __ANDROID__ + // If there is no PATH variable specified inside the environment then set the + // path to /system/bin. It is required because the default path used by + // execve() is wrong on android. + env.try_emplace("PATH", "/system/bin"); +#endif + return env.getEnvp(); +} + +ForkLaunchInfo::ForkLaunchInfo(const ProcessLaunchInfo &info) + : separate_process_group( + info.GetFlags().Test(eLaunchFlagLaunchInSeparateProcessGroup)), + debug(info.GetFlags().Test(eLaunchFlagDebug)), + disable_aslr(info.GetFlags().Test(eLaunchFlagDisableASLR)), + wd(info.GetWorkingDirectory().GetPath()), + argv(info.GetArguments().GetConstArgumentVector()), + envp(FixupEnvironment(info.GetEnvironment())), + actions(MakeForkActions(info)) {} + HostProcess ProcessLauncherPosixFork::LaunchProcess(const ProcessLaunchInfo &launch_info, Status &error) { - char exe_path[PATH_MAX]; - launch_info.GetExecutableFile().GetPath(exe_path, sizeof(exe_path)); - // A pipe used by the child process to report errors. 
PipePosix pipe; const bool child_processes_inherit = false; @@ -211,6 +264,8 @@ if (error.Fail()) return HostProcess(); + const ForkLaunchInfo fork_launch_info(launch_info); + ::pid_t pid = ::fork(); if (pid == -1) { // Fork failed @@ -221,7 +276,7 @@ if (pid == 0) { // child process pipe.CloseReadFileDescriptor(); - ChildFunc(pipe.ReleaseWriteFileDescriptor(), launch_info); + ChildFunc(pipe.ReleaseWriteFileDescriptor(), fork_launch_info); } // parent process diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.h b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.h --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.h +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.h @@ -164,7 +164,7 @@ static Status SetDefaultPtraceOpts(const lldb::pid_t); - void MonitorCallback(lldb::pid_t pid, bool exited, WaitStatus status); + void MonitorCallback(lldb::pid_t pid, WaitStatus status); void WaitForCloneNotification(::pid_t pid); @@ -176,8 +176,7 @@ void MonitorWatchpoint(NativeThreadLinux &thread, uint32_t wp_index); - void MonitorSignal(const siginfo_t &info, NativeThreadLinux &thread, - bool exited); + void MonitorSignal(const siginfo_t &info, NativeThreadLinux &thread); bool HasThreadNoLock(lldb::tid_t thread_id); diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -426,8 +426,7 @@ } // Handles all waitpid events from the inferior process. -void NativeProcessLinux::MonitorCallback(lldb::pid_t pid, bool exited, - WaitStatus status) { +void NativeProcessLinux::MonitorCallback(lldb::pid_t pid, WaitStatus status) { Log *log(GetLogIfAnyCategoriesSet(LIBLLDB_LOG_PROCESS)); // Certain activities differ based on whether the pid is the tid of the main @@ -435,7 +434,7 @@ const bool is_main_thread = (pid == GetID()); // Handle when the thread exits. - if (exited) { + if (status.type == WaitStatus::Exit || status.type == WaitStatus::Signal) { LLDB_LOG(log, "got exit status({0}) , tid = {1} ({2} main thread), process " "state = {3}", @@ -485,7 +484,7 @@ if (info.si_signo == SIGTRAP) MonitorSIGTRAP(info, *thread_sp); else - MonitorSignal(info, *thread_sp, exited); + MonitorSignal(info, *thread_sp); } else { if (info_err.GetError() == EINVAL) { // This is a group stop reception for this tid. 
We can reach here if we @@ -753,7 +752,7 @@ default: LLDB_LOG(log, "received unknown SIGTRAP stop event ({0}, pid {1} tid {2}", info.si_code, GetID(), thread.GetID()); - MonitorSignal(info, thread, false); + MonitorSignal(info, thread); break; } } @@ -801,7 +800,7 @@ } void NativeProcessLinux::MonitorSignal(const siginfo_t &info, - NativeThreadLinux &thread, bool exited) { + NativeThreadLinux &thread) { const int signo = info.si_signo; const bool is_from_llgs = info.si_pid == getpid(); @@ -1962,16 +1961,11 @@ } WaitStatus wait_status = WaitStatus::Decode(status); - bool exited = wait_status.type == WaitStatus::Exit || - (wait_status.type == WaitStatus::Signal && - wait_pid == static_cast<::pid_t>(GetID())); - LLDB_LOG( - log, - "waitpid (-1, &status, _) => pid = {0}, status = {1}, exited = {2}", - wait_pid, wait_status, exited); + LLDB_LOG(log, "waitpid (-1, &status, _) => pid = {0}, status = {1}", + wait_pid, wait_status); - MonitorCallback(wait_pid, exited, wait_status); + MonitorCallback(wait_pid, wait_status); } } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DIERef.h b/lldb/source/Plugins/SymbolFile/DWARF/DIERef.h --- a/lldb/source/Plugins/SymbolFile/DWARF/DIERef.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DIERef.h @@ -54,6 +54,37 @@ return m_die_offset < other.m_die_offset; } + bool operator==(const DIERef &rhs) const { + return dwo_num() == rhs.dwo_num() && m_section == rhs.m_section && + m_die_offset == rhs.m_die_offset; + } + + bool operator!=(const DIERef &rhs) const { return !(*this == rhs); } + + /// Decode a serialized version of this object from data. + /// + /// \param data + /// The decoder object that references the serialized data. + /// + /// \param offset_ptr + /// A pointer that contains the offset from which the data will be + /// decoded, and that gets updated as data gets decoded. + /// + /// \return + /// Returns a valid DIERef if decoding succeeded, llvm::None if the + /// decoded values were insufficient or invalid. + static llvm::Optional Decode(const lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr); + + /// Encode this object into a data encoder object. + /// + /// This allows this object to be serialized to disk. + /// + /// \param encoder + /// A data encoder object that serialized bytes will be encoded into. + /// + void Encode(lldb_private::DataEncoder &encoder) const; + private: uint32_t m_dwo_num : 30; uint32_t m_dwo_num_valid : 1; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DIERef.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DIERef.cpp --- a/lldb/source/Plugins/SymbolFile/DWARF/DIERef.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DIERef.cpp @@ -7,8 +7,13 @@ //===----------------------------------------------------------------------===// #include "DIERef.h" +#include "lldb/Utility/DataEncoder.h" +#include "lldb/Utility/DataExtractor.h" #include "llvm/Support/Format.h" +using namespace lldb; +using namespace lldb_private; + void llvm::format_provider::format(const DIERef &ref, raw_ostream &OS, StringRef Style) { if (ref.dwo_num()) @@ -16,3 +21,35 @@ OS << (ref.section() == DIERef::DebugInfo ? "INFO" : "TYPE"); OS << "/" << format_hex_no_prefix(ref.die_offset(), 8); } + +constexpr uint32_t k_dwo_num_mask = 0x3FFFFFFF; +constexpr uint32_t k_dwo_num_valid_bitmask = (1u << 30); +constexpr uint32_t k_section_bitmask = (1u << 31); + +llvm::Optional DIERef::Decode(const DataExtractor &data, + lldb::offset_t *offset_ptr) { + const uint32_t bitfield_storage = data.GetU32(offset_ptr); + uint32_t dwo_num = bitfield_storage & k_dwo_num_mask; + bool dwo_num_valid = (bitfield_storage & (k_dwo_num_valid_bitmask)) != 0; + Section section = (Section)((bitfield_storage & (k_section_bitmask)) != 0); + // DIE offsets can't be zero, and if we fail to decode something from data, + // GetU32 will return 0. + dw_offset_t die_offset = data.GetU32(offset_ptr); + if (die_offset == 0) + return llvm::None; + if (dwo_num_valid) + return DIERef(dwo_num, section, die_offset); + else + return DIERef(llvm::None, section, die_offset); +} + +void DIERef::Encode(DataEncoder &encoder) const { + uint32_t bitfield_storage = m_dwo_num; + if (m_dwo_num_valid) + bitfield_storage |= k_dwo_num_valid_bitmask; + if (m_section) + bitfield_storage |= k_section_bitmask; + encoder.AppendU32(bitfield_storage); + static_assert(sizeof(m_die_offset) == 4, "m_die_offset must be 4 bytes"); + encoder.AppendU32(m_die_offset); +} diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h @@ -55,7 +55,7 @@ void Dump(Stream &s) override; -private: + // Make IndexSet public so we can unit test the encoding and decoding logic. struct IndexSet { NameToDIE function_basenames; NameToDIE function_fullnames; @@ -65,21 +65,113 @@ NameToDIE globals; NameToDIE types; NameToDIE namespaces; + bool Decode(const DataExtractor &data, lldb::offset_t *offset_ptr); + void Encode(DataEncoder &encoder) const; + bool operator==(const IndexSet &rhs) const { + return function_basenames == rhs.function_basenames && + function_fullnames == rhs.function_fullnames && + function_methods == rhs.function_methods && + function_selectors == rhs.function_selectors && + objc_class_selectors == rhs.objc_class_selectors && + globals == rhs.globals && types == rhs.types && + namespaces == rhs.namespaces; + } }; + +private: void Index(); + + /// Decode a serialized version of this object from data. + /// + /// \param data + /// The decoder object that references the serialized data. + /// + /// \param offset_ptr + /// A pointer that contains the offset from which the data will be + /// decoded, and that gets updated as data gets decoded. + /// + /// \param signature_mismatch + /// Set to true if the signature found in the cache data does not match + /// the current object file; the caller should then discard the cache + /// entry. + bool Decode(const DataExtractor &data, lldb::offset_t *offset_ptr, + bool &signature_mismatch); + + /// Encode this object into a data encoder object. + /// + /// This allows this object to be serialized to disk. All strings are + /// interned into a string table as part of the encoding, for efficiency + /// and cache file size reduction. + /// + /// \param encoder + /// A data encoder object that serialized bytes will be encoded into. + /// + /// \return + /// True if the object file can generate a valid signature and all data + /// for the index was encoded, false otherwise. + bool Encode(DataEncoder &encoder) const; + + /// Get the cache key string for this DWARF index. + /// + /// The cache key must start with the module's cache key and is followed + /// by information that indicates this key is for caching the DWARF index + /// contents, and it should also include the hash of the object file. A + /// module can be represented by an ObjectFile object for the main + /// executable, but can also have a symbol file that is from the same or a + /// different object file. This means we might have two DWARF indexes cached + /// in the index cache, one for the main executable and one for the symbol + /// file. + /// + /// \return + /// The unique cache key used to save and retrieve data from the index + /// cache. + std::string GetCacheKey(); + + /// Save the DWARF index data out into a cache. + /// + /// The index will only be saved to a cache file if caching is enabled. + /// + /// We cache the contents of the index since building it requires parsing + /// and classifying every DIE in every compile unit, which can take some + /// time for large binaries. Caching the finalized index lets subsequent + /// debug sessions skip that work. + void SaveToCache(); + + /// Load the DWARF index from the index cache. + /// + /// Quickly load the finalized DWARF index from the index cache. This saves + /// time when the debugger starts up. The index cache file for the DWARF + /// index has the modification time set to the same time as the main module. + /// If the cache file exists and the modification times match, we will load + /// the index from the serialized cache file. + /// + /// \return + /// True if the index was successfully loaded from the index cache, + /// false if the index wasn't cached or was out of date. + bool LoadFromCache(); + void IndexUnit(DWARFUnit &unit, SymbolFileDWARFDwo *dwp, IndexSet &set); static void IndexUnitImpl(DWARFUnit &unit, const lldb::LanguageType cu_language, IndexSet &set); - /// The DWARF file which we are indexing. Set to nullptr after the index is - /// built. + /// The DWARF file which we are indexing. SymbolFileDWARF *m_dwarf; /// Which dwarf units should we skip while building the index.
llvm::DenseSet m_units_to_avoid; IndexSet m_set; + bool m_indexed = false; }; } // namespace lldb_private diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp @@ -12,9 +12,12 @@ #include "Plugins/SymbolFile/DWARF/DWARFDeclContext.h" #include "Plugins/SymbolFile/DWARF/LogChannelDWARF.h" #include "Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h" +#include "lldb/Core/DataFileCache.h" #include "lldb/Core/Module.h" #include "lldb/Core/Progress.h" #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Utility/DataEncoder.h" +#include "lldb/Utility/DataExtractor.h" #include "lldb/Utility/Stream.h" #include "lldb/Utility/Timer.h" #include "llvm/Support/FormatVariadic.h" @@ -24,17 +27,19 @@ using namespace lldb; void ManualDWARFIndex::Index() { - if (!m_dwarf) + if (m_indexed) return; - - SymbolFileDWARF &main_dwarf = *m_dwarf; - m_dwarf = nullptr; + m_indexed = true; ElapsedTime elapsed(m_index_time); - LLDB_SCOPED_TIMERF("%p", static_cast(&main_dwarf)); + LLDB_SCOPED_TIMERF("%p", static_cast(m_dwarf)); + if (LoadFromCache()) { + m_dwarf->SetDebugInfoIndexWasLoadedFromCache(); + return; + } - DWARFDebugInfo &main_info = main_dwarf.DebugInfo(); - SymbolFileDWARFDwo *dwp_dwarf = main_dwarf.GetDwpSymbolFile().get(); + DWARFDebugInfo &main_info = m_dwarf->DebugInfo(); + SymbolFileDWARFDwo *dwp_dwarf = m_dwarf->GetDwpSymbolFile().get(); DWARFDebugInfo *dwp_info = dwp_dwarf ? &dwp_dwarf->DebugInfo() : nullptr; std::vector units_to_index; @@ -125,6 +130,8 @@ pool.async(finalize_fn, &IndexSet::types); pool.async(finalize_fn, &IndexSet::namespaces); pool.wait(); + + SaveToCache(); } void ManualDWARFIndex::IndexUnit(DWARFUnit &unit, SymbolFileDWARFDwo *dwp, @@ -480,3 +487,214 @@ s.Printf("\nNamespaces:\n"); m_set.namespaces.Dump(&s); } + +constexpr llvm::StringLiteral kIdentifierManualDWARFIndex("DIDX"); +// Define IDs for the different tables when encoding and decoding the +// ManualDWARFIndex NameToDIE objects so we can avoid saving any empty maps. +enum DataID { + kDataIDFunctionBasenames = 1u, + kDataIDFunctionFullnames, + kDataIDFunctionMethods, + kDataIDFunctionSelectors, + kDataIDFunctionObjcClassSelectors, + kDataIDGlobals, + kDataIDTypes, + kDataIDNamespaces, + kDataIDEnd = 255u, + +}; +constexpr uint32_t CURRENT_CACHE_VERSION = 1; + +bool ManualDWARFIndex::IndexSet::Decode(const DataExtractor &data, + lldb::offset_t *offset_ptr) { + StringTableReader strtab; + // We now decode the string table for all strings in the data cache file. + if (!strtab.Decode(data, offset_ptr)) + return false; + + llvm::StringRef identifier((const char *)data.GetData(offset_ptr, 4), 4); + if (identifier != kIdentifierManualDWARFIndex) + return false; + const uint32_t version = data.GetU32(offset_ptr); + if (version != CURRENT_CACHE_VERSION) + return false; + + bool done = false; + while (!done) { + switch (data.GetU8(offset_ptr)) { + default: + // If we got here, this is not expected; the data IDs must match + // one of the values from the DataID enumeration.
+ return false; + case kDataIDFunctionBasenames: + if (!function_basenames.Decode(data, offset_ptr, strtab)) + return false; + break; + case kDataIDFunctionFullnames: + if (!function_fullnames.Decode(data, offset_ptr, strtab)) + return false; + break; + case kDataIDFunctionMethods: + if (!function_methods.Decode(data, offset_ptr, strtab)) + return false; + break; + case kDataIDFunctionSelectors: + if (!function_selectors.Decode(data, offset_ptr, strtab)) + return false; + break; + case kDataIDFunctionObjcClassSelectors: + if (!objc_class_selectors.Decode(data, offset_ptr, strtab)) + return false; + break; + case kDataIDGlobals: + if (!globals.Decode(data, offset_ptr, strtab)) + return false; + break; + case kDataIDTypes: + if (!types.Decode(data, offset_ptr, strtab)) + return false; + break; + case kDataIDNamespaces: + if (!namespaces.Decode(data, offset_ptr, strtab)) + return false; + break; + case kDataIDEnd: + // We got to the end of our NameToDIE encodings. + done = true; + break; + } + } + // Success! + return true; +} + +void ManualDWARFIndex::IndexSet::Encode(DataEncoder &encoder) const { + ConstStringTable strtab; + + // Encode the DWARF index into a separate encoder first. This allows us to + // gather all of the strings we will need in "strtab", as we need to write + // the string table out before the index data. + DataEncoder index_encoder(encoder.GetByteOrder(), + encoder.GetAddressByteSize()); + + index_encoder.AppendData(kIdentifierManualDWARFIndex); + // Encode the data version. + index_encoder.AppendU32(CURRENT_CACHE_VERSION); + + if (!function_basenames.IsEmpty()) { + index_encoder.AppendU8(kDataIDFunctionBasenames); + function_basenames.Encode(index_encoder, strtab); + } + if (!function_fullnames.IsEmpty()) { + index_encoder.AppendU8(kDataIDFunctionFullnames); + function_fullnames.Encode(index_encoder, strtab); + } + if (!function_methods.IsEmpty()) { + index_encoder.AppendU8(kDataIDFunctionMethods); + function_methods.Encode(index_encoder, strtab); + } + if (!function_selectors.IsEmpty()) { + index_encoder.AppendU8(kDataIDFunctionSelectors); + function_selectors.Encode(index_encoder, strtab); + } + if (!objc_class_selectors.IsEmpty()) { + index_encoder.AppendU8(kDataIDFunctionObjcClassSelectors); + objc_class_selectors.Encode(index_encoder, strtab); + } + if (!globals.IsEmpty()) { + index_encoder.AppendU8(kDataIDGlobals); + globals.Encode(index_encoder, strtab); + } + if (!types.IsEmpty()) { + index_encoder.AppendU8(kDataIDTypes); + types.Encode(index_encoder, strtab); + } + if (!namespaces.IsEmpty()) { + index_encoder.AppendU8(kDataIDNamespaces); + namespaces.Encode(index_encoder, strtab); + } + index_encoder.AppendU8(kDataIDEnd); + + // Now that all strings have been gathered, we will emit the string table. + strtab.Encode(encoder); + // Followed by the DWARF index data.
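+ // (Two-pass layout: interning every name into strtab while encoding into + // index_encoder above is what allows the string table to be written first, + // with the index payload referring to strings by uint32_t offset.)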
+ encoder.AppendData(index_encoder.GetData()); +} + +bool ManualDWARFIndex::Decode(const DataExtractor &data, + lldb::offset_t *offset_ptr, + bool &signature_mismatch) { + signature_mismatch = false; + CacheSignature signature; + if (!signature.Decode(data, offset_ptr)) + return false; + if (CacheSignature(m_dwarf->GetObjectFile()) != signature) { + signature_mismatch = true; + return false; + } + IndexSet set; + if (!set.Decode(data, offset_ptr)) + return false; + m_set = std::move(set); + return true; +} + +bool ManualDWARFIndex::Encode(DataEncoder &encoder) const { + CacheSignature signature(m_dwarf->GetObjectFile()); + if (!signature.Encode(encoder)) + return false; + m_set.Encode(encoder); + return true; +} + +std::string ManualDWARFIndex::GetCacheKey() { + std::string key; + llvm::raw_string_ostream strm(key); + // The DWARF index can come from different object files for the same module. + // A module can have one object file as the main executable and might have + // another object file in a separate symbol file, or we might have a .dwo file + // that claims its module is the main executable. + ObjectFile *objfile = m_dwarf->GetObjectFile(); + strm << objfile->GetModule()->GetCacheKey() << "-dwarf-index-" + << llvm::format_hex(objfile->GetCacheHash(), 10); + return strm.str(); +} + +bool ManualDWARFIndex::LoadFromCache() { + DataFileCache *cache = Module::GetIndexCache(); + if (!cache) + return false; + ObjectFile *objfile = m_dwarf->GetObjectFile(); + if (!objfile) + return false; + std::unique_ptr mem_buffer_up = + cache->GetCachedData(GetCacheKey()); + if (!mem_buffer_up) + return false; + DataExtractor data(mem_buffer_up->getBufferStart(), + mem_buffer_up->getBufferSize(), + endian::InlHostByteOrder(), + objfile->GetAddressByteSize()); + bool signature_mismatch = false; + lldb::offset_t offset = 0; + const bool result = Decode(data, &offset, signature_mismatch); + if (signature_mismatch) + cache->RemoveCacheFile(GetCacheKey()); + return result; +} + +void ManualDWARFIndex::SaveToCache() { + DataFileCache *cache = Module::GetIndexCache(); + if (!cache) + return; // Caching is not enabled. + ObjectFile *objfile = m_dwarf->GetObjectFile(); + if (!objfile) + return; + DataEncoder file(endian::InlHostByteOrder(), objfile->GetAddressByteSize()); + // Encode will return false if the object file doesn't have anything to make + // a signature from. + if (Encode(file)) { + if (cache->SetCachedData(GetCacheKey(), file.GetData())) + m_dwarf->SetDebugInfoIndexWasSavedToCache(); + } +} diff --git a/lldb/source/Plugins/SymbolFile/DWARF/NameToDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/NameToDIE.h --- a/lldb/source/Plugins/SymbolFile/DWARF/NameToDIE.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/NameToDIE.h @@ -48,6 +48,44 @@ const DIERef &die_ref)> const &callback) const; + /// Decode a serialized version of this object from data. + /// + /// \param data + /// The decoder object that references the serialized data. + /// + /// \param offset_ptr + /// A pointer that contains the offset from which the data will be + /// decoded, and that gets updated as data gets decoded. + /// + /// \param strtab + /// All strings in cache files are put into string tables for efficiency + /// and cache file size reduction. Strings are stored as uint32_t string + /// table offsets in the cache data. + bool Decode(const lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr, + const lldb_private::StringTableReader &strtab); + + /// Encode this object into a data encoder object.
+ /// + /// This allows this object to be serialized to disk. + /// + /// \param encoder + /// A data encoder object that serialized bytes will be encoded into. + /// + /// \param strtab + /// All strings in cache files are put into string tables for efficiency + /// and cache file size reduction. Strings are stored as uint32_t string + /// table offsets in the cache data. + void Encode(lldb_private::DataEncoder &encoder, + lldb_private::ConstStringTable &strtab) const; + + /// Used for unit testing the encoding and decoding. + bool operator==(const NameToDIE &rhs) const; + + bool IsEmpty() const { return m_map.IsEmpty(); } + + void Clear() { m_map.Clear(); } + protected: lldb_private::UniqueCStringMap m_map; }; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/NameToDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/NameToDIE.cpp --- a/lldb/source/Plugins/SymbolFile/DWARF/NameToDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/NameToDIE.cpp @@ -8,8 +8,11 @@ #include "NameToDIE.h" #include "DWARFUnit.h" +#include "lldb/Core/DataFileCache.h" #include "lldb/Symbol/ObjectFile.h" #include "lldb/Utility/ConstString.h" +#include "lldb/Utility/DataEncoder.h" +#include "lldb/Utility/DataExtractor.h" #include "lldb/Utility/RegularExpression.h" #include "lldb/Utility/Stream.h" #include "lldb/Utility/StreamString.h" @@ -87,3 +90,50 @@ other.m_map.GetValueAtIndexUnchecked(i)); } } + +constexpr llvm::StringLiteral kIdentifierNameToDIE("N2DI"); + +bool NameToDIE::Decode(const DataExtractor &data, lldb::offset_t *offset_ptr, + const StringTableReader &strtab) { + m_map.Clear(); + llvm::StringRef identifier((const char *)data.GetData(offset_ptr, 4), 4); + if (identifier != kIdentifierNameToDIE) + return false; + const uint32_t count = data.GetU32(offset_ptr); + for (uint32_t i = 0; i < count; ++i) { + llvm::StringRef str(strtab.Get(data.GetU32(offset_ptr))); + // No empty strings allowed in the name to DIE maps. + if (str.empty()) + return false; + if (llvm::Optional die_ref = DIERef::Decode(data, offset_ptr)) + m_map.Append(ConstString(str), die_ref.getValue()); + else + return false; + } + return true; +} + +void NameToDIE::Encode(DataEncoder &encoder, ConstStringTable &strtab) const { + encoder.AppendData(kIdentifierNameToDIE); + encoder.AppendU32(m_map.GetSize()); + for (const auto &entry : m_map) { + // Make sure there are no empty strings. 
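+ // (Decode() above rejects empty strings as corruption, so encoding one + // here would produce a cache entry that could never be read back.)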
+ assert((bool)entry.cstring); + encoder.AppendU32(strtab.Add(entry.cstring)); + entry.value.Encode(encoder); + } +} + +bool NameToDIE::operator==(const NameToDIE &rhs) const { + const size_t size = m_map.GetSize(); + if (size != rhs.m_map.GetSize()) + return false; + for (size_t i = 0; i < size; ++i) { + if (m_map.GetCStringAtIndex(i) != rhs.m_map.GetCStringAtIndex(i)) + return false; + if (m_map.GetValueRefAtIndexUnchecked(i) != + rhs.m_map.GetValueRefAtIndexUnchecked(i)) + return false; + } + return true; +} diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -2140,7 +2140,8 @@ llvm::StringRef basename; llvm::StringRef context; - bool name_is_mangled = (bool)Mangled(name); + bool name_is_mangled = Mangled::GetManglingScheme(name.GetStringRef()) != + Mangled::eManglingSchemeNone; if (!CPlusPlusLanguage::ExtractContextAndIdentifier(name.GetCString(), context, basename)) diff --git a/lldb/source/Symbol/Symtab.cpp b/lldb/source/Symbol/Symtab.cpp --- a/lldb/source/Symbol/Symtab.cpp +++ b/lldb/source/Symbol/Symtab.cpp @@ -34,7 +34,8 @@ Symtab::Symtab(ObjectFile *objfile) : m_objfile(objfile), m_symbols(), m_file_addr_to_index(*this), m_name_to_symbol_indices(), m_mutex(), - m_file_addr_to_index_computed(false), m_name_indexes_computed(false) { + m_file_addr_to_index_computed(false), m_name_indexes_computed(false), + m_loaded_from_cache(false), m_saved_to_cache(false) { m_name_to_symbol_indices.emplace(std::make_pair( lldb::eFunctionNameTypeNone, UniqueCStringMap())); m_name_to_symbol_indices.emplace(std::make_pair( @@ -1179,7 +1180,8 @@ // Encode will return false if the symbol table's object file doesn't have // anything to make a signature from. 
if (Encode(file)) - cache->SetCachedData(GetCacheKey(), file.GetData()); + if (cache->SetCachedData(GetCacheKey(), file.GetData())) + SetWasSavedToCache(); } constexpr llvm::StringLiteral kIdentifierCStrMap("CMAP"); @@ -1343,5 +1345,7 @@ const bool result = Decode(data, &offset, signature_mismatch); if (signature_mismatch) cache->RemoveCacheFile(GetCacheKey()); + if (result) + SetWasLoadedFromCache(); return result; } diff --git a/lldb/source/Target/Statistics.cpp b/lldb/source/Target/Statistics.cpp --- a/lldb/source/Target/Statistics.cpp +++ b/lldb/source/Target/Statistics.cpp @@ -52,9 +52,15 @@ module.try_emplace("identifier", identifier); module.try_emplace("symbolTableParseTime", symtab_parse_time); module.try_emplace("symbolTableIndexTime", symtab_index_time); + module.try_emplace("symbolTableLoadedFromCache", symtab_loaded_from_cache); + module.try_emplace("symbolTableSavedToCache", symtab_saved_to_cache); module.try_emplace("debugInfoParseTime", debug_parse_time); module.try_emplace("debugInfoIndexTime", debug_index_time); module.try_emplace("debugInfoByteSize", (int64_t)debug_info_size); + module.try_emplace("debugInfoIndexLoadedFromCache", + debug_info_index_loaded_from_cache); + module.try_emplace("debugInfoIndexSavedToCache", + debug_info_index_saved_to_cache); return module; } @@ -144,6 +150,10 @@ double symtab_index_time = 0.0; double debug_parse_time = 0.0; double debug_index_time = 0.0; + uint32_t symtabs_loaded = 0; + uint32_t symtabs_saved = 0; + uint32_t debug_index_loaded = 0; + uint32_t debug_index_saved = 0; uint64_t debug_info_size = 0; if (target) { json_targets.emplace_back(target->ReportStatistics()); @@ -169,11 +179,28 @@ module_stat.triple = module->GetArchitecture().GetTriple().str(); module_stat.symtab_parse_time = module->GetSymtabParseTime().count(); module_stat.symtab_index_time = module->GetSymtabIndexTime().count(); + Symtab *symtab = module->GetSymtab(); + if (symtab) { + module_stat.symtab_loaded_from_cache = symtab->GetWasLoadedFromCache(); + if (module_stat.symtab_loaded_from_cache) + ++symtabs_loaded; + module_stat.symtab_saved_to_cache = symtab->GetWasSavedToCache(); + if (module_stat.symtab_saved_to_cache) + ++symtabs_saved; + } SymbolFile *sym_file = module->GetSymbolFile(); if (sym_file) { module_stat.debug_index_time = sym_file->GetDebugInfoIndexTime().count(); module_stat.debug_parse_time = sym_file->GetDebugInfoParseTime().count(); module_stat.debug_info_size = sym_file->GetDebugInfoSize(); + module_stat.debug_info_index_loaded_from_cache = + sym_file->GetDebugInfoIndexWasLoadedFromCache(); + if (module_stat.debug_info_index_loaded_from_cache) + ++debug_index_loaded; + module_stat.debug_info_index_saved_to_cache = + sym_file->GetDebugInfoIndexWasSavedToCache(); + if (module_stat.debug_info_index_saved_to_cache) + ++debug_index_saved; } symtab_parse_time += module_stat.symtab_parse_time; symtab_index_time += module_stat.symtab_index_time; @@ -188,8 +215,12 @@ {"modules", std::move(json_modules)}, {"totalSymbolTableParseTime", symtab_parse_time}, {"totalSymbolTableIndexTime", symtab_index_time}, + {"totalSymbolTablesLoadedFromCache", symtabs_loaded}, + {"totalSymbolTablesSavedToCache", symtabs_saved}, {"totalDebugInfoParseTime", debug_parse_time}, {"totalDebugInfoIndexTime", debug_index_time}, + {"totalDebugInfoIndexLoadedFromCache", debug_index_loaded}, + {"totalDebugInfoIndexSavedToCache", debug_index_saved}, {"totalDebugInfoByteSize", debug_info_size}, }; return std::move(global_stats); diff --git a/lldb/source/Utility/Log.cpp 
b/lldb/source/Utility/Log.cpp --- a/lldb/source/Utility/Log.cpp +++ b/lldb/source/Utility/Log.cpp @@ -30,7 +30,6 @@ #include #else #include -#include #endif using namespace lldb_private; @@ -180,9 +179,6 @@ } void Log::Initialize() { -#ifdef LLVM_ON_UNIX - pthread_atfork(nullptr, nullptr, &Log::DisableLoggingChild); -#endif InitializeLldbChannel(); } @@ -346,11 +342,3 @@ message << payload << "\n"; WriteMessage(message.str()); } - -void Log::DisableLoggingChild() { - // Disable logging by clearing out the atomic variable after forking -- if we - // forked while another thread held the channel mutex, we would deadlock when - // trying to write to the log. - for (auto &c: *g_channel_map) - c.second.m_channel.log_ptr.store(nullptr, std::memory_order_relaxed); -} diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py --- a/lldb/test/API/commands/statistics/basic/TestStats.py +++ b/lldb/test/API/commands/statistics/basic/TestStats.py @@ -164,8 +164,12 @@ 'targets', 'totalSymbolTableParseTime', 'totalSymbolTableIndexTime', + 'totalSymbolTablesLoadedFromCache', + 'totalSymbolTablesSavedToCache', 'totalDebugInfoByteSize', 'totalDebugInfoIndexTime', + 'totalDebugInfoIndexLoadedFromCache', + 'totalDebugInfoIndexSavedToCache', 'totalDebugInfoParseTime', ] self.verify_keys(debug_stats, '"debug_stats"', debug_stat_keys, None) @@ -227,8 +231,12 @@ 'targets', 'totalSymbolTableParseTime', 'totalSymbolTableIndexTime', + 'totalSymbolTablesLoadedFromCache', + 'totalSymbolTablesSavedToCache', 'totalDebugInfoByteSize', 'totalDebugInfoIndexTime', + 'totalDebugInfoIndexLoadedFromCache', + 'totalDebugInfoIndexSavedToCache', 'totalDebugInfoParseTime', ] self.verify_keys(debug_stats, '"debug_stats"', debug_stat_keys, None) @@ -265,8 +273,12 @@ 'targets', 'totalSymbolTableParseTime', 'totalSymbolTableIndexTime', + 'totalSymbolTablesLoadedFromCache', + 'totalSymbolTablesSavedToCache', 'totalDebugInfoParseTime', 'totalDebugInfoIndexTime', + 'totalDebugInfoIndexLoadedFromCache', + 'totalDebugInfoIndexSavedToCache', 'totalDebugInfoByteSize' ] self.verify_keys(debug_stats, '"debug_stats"', debug_stat_keys, None) @@ -278,12 +290,16 @@ exe_module = self.find_module_in_metrics(exe, debug_stats) module_keys = [ 'debugInfoByteSize', + 'debugInfoIndexLoadedFromCache', 'debugInfoIndexTime', + 'debugInfoIndexSavedToCache', 'debugInfoParseTime', 'identifier', 'path', 'symbolTableIndexTime', + 'symbolTableLoadedFromCache', 'symbolTableParseTime', + 'symbolTableSavedToCache', 'triple', 'uuid', ] @@ -343,8 +359,12 @@ 'targets', 'totalSymbolTableParseTime', 'totalSymbolTableIndexTime', + 'totalSymbolTablesLoadedFromCache', + 'totalSymbolTablesSavedToCache', 'totalDebugInfoParseTime', 'totalDebugInfoIndexTime', + 'totalDebugInfoIndexLoadedFromCache', + 'totalDebugInfoIndexSavedToCache', 'totalDebugInfoByteSize', ] self.verify_keys(debug_stats, '"debug_stats"', debug_stat_keys, None) diff --git a/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py b/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py new file mode 100644 --- /dev/null +++ b/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py @@ -0,0 +1,141 @@ +import glob +import json +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +import os +import time + + +class DebugIndexCacheTestcase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + def 
setUp(self): + # Call super's setUp(). + TestBase.setUp(self) + # Set the lldb module cache directory to a directory inside the build + # artifacts directory so no other tests are interfered with. + self.cache_dir = os.path.join(self.getBuildDir(), 'lldb-module-cache') + + def get_module_cache_files(self, basename): + module_cache_glob = os.path.join(self.cache_dir, + "llvmcache-*%s*dwarf-index*" % (basename)) + return glob.glob(module_cache_glob) + + def get_stats(self, log_path=None): + """ + Get the output of the "statistics dump" and return the JSON as a + python dictionary. + """ + # If log_path is set, open the path and emit the output of the command + # for debugging purposes. + if log_path is not None: + f = open(log_path, 'w') + else: + f = None + return_obj = lldb.SBCommandReturnObject() + command = "statistics dump " + if f: + f.write('(lldb) %s\n' % (command)) + self.ci.HandleCommand(command, return_obj, False) + metrics_json = return_obj.GetOutput() + if f: + f.write(metrics_json) + return json.loads(metrics_json) + + def enable_lldb_index_cache(self): + self.runCmd('settings set symbols.lldb-index-cache-path "%s"' % (self.cache_dir)) + self.runCmd('settings set symbols.enable-lldb-index-cache true') + + @no_debug_info_test + def test_with_caching_enabled(self): + """ + Test module cache functionality for debug info index caching. + + We test that a debug info index file is created for the debug + information when caching is enabled with a file that contains + at least one of each kind of DIE in ManualDWARFIndex::IndexSet. + + The input file has DWARF that will fill in every member of the + ManualDWARFIndex::IndexSet class to ensure we can encode all of the + required information. + + With caching enabled, we also verify that the appropriate statistics + specify that the cache file was saved to the cache. + """ + self.enable_lldb_index_cache() + src_dir = self.getSourceDir() + yaml_path = os.path.join(src_dir, "exe.yaml") + yaml_base, ext = os.path.splitext(yaml_path) + obj_path = self.getBuildArtifact("main.o") + self.yaml2obj(yaml_path, obj_path) + + # Create a target with the object file we just created from YAML + target = self.dbg.CreateTarget(obj_path) + self.assertTrue(target, VALID_TARGET) + + debug_index_cache_files = self.get_module_cache_files('main.o') + self.assertEqual(len(debug_index_cache_files), 1, + "make sure there is one file in the module cache directory (%s) for main.o that is a debug info cache" % (self.cache_dir)) + + # Verify that the module statistics have the information that specifies + # if we loaded or saved the debug index and symtab to the cache + stats = self.get_stats() + module_stats = stats['modules'][0] + self.assertFalse(module_stats['debugInfoIndexLoadedFromCache']) + self.assertTrue(module_stats['debugInfoIndexSavedToCache']) + self.assertFalse(module_stats['symbolTableLoadedFromCache']) + self.assertTrue(module_stats['symbolTableSavedToCache']) + # Verify the top level stats track how many things were loaded or saved + # to the cache. + self.assertEqual(stats["totalDebugInfoIndexLoadedFromCache"], 0) + self.assertEqual(stats["totalDebugInfoIndexSavedToCache"], 1) + self.assertEqual(stats["totalSymbolTablesLoadedFromCache"], 0) + self.assertEqual(stats["totalSymbolTablesSavedToCache"], 1) + + @no_debug_info_test + def test_with_caching_disabled(self): + """ + Test module cache functionality for debug info index caching. 
+ + We test that a debug info index file is not created for the debug + information when caching is disabled with a file that contains + at least one of each kind of DIE in ManualDWARFIndex::IndexSet. + + The input file has DWARF that will fill in every member of the + ManualDWARFIndex::IndexSet class to ensure we can encode all of the + required information. + + With caching disabled, we also verify that the appropriate + statistics specify that the cache file was not saved to the cache. + """ + src_dir = self.getSourceDir() + yaml_path = os.path.join(src_dir, "exe.yaml") + yaml_base, ext = os.path.splitext(yaml_path) + obj_path = self.getBuildArtifact("main.o") + self.yaml2obj(yaml_path, obj_path) + + # Create a target with the object file we just created from YAML + target = self.dbg.CreateTarget(obj_path) + self.assertTrue(target, VALID_TARGET) + + debug_index_cache_files = self.get_module_cache_files('main.o') + self.assertEqual(len(debug_index_cache_files), 0, + "make sure there is no file in the module cache directory (%s) for main.o that is a debug info cache" % (self.cache_dir)) + + # Verify that the module statistics have the information that specifies + # if we loaded or saved the debug index and symtab to the cache + stats = self.get_stats() + module_stats = stats['modules'][0] + self.assertFalse(module_stats['debugInfoIndexLoadedFromCache']) + self.assertFalse(module_stats['debugInfoIndexSavedToCache']) + self.assertFalse(module_stats['symbolTableLoadedFromCache']) + self.assertFalse(module_stats['symbolTableSavedToCache']) + # Verify the top level stats track how many things were loaded or saved + # to the cache. + self.assertEqual(stats["totalDebugInfoIndexLoadedFromCache"], 0) + self.assertEqual(stats["totalDebugInfoIndexSavedToCache"], 0) + self.assertEqual(stats["totalSymbolTablesLoadedFromCache"], 0) + self.assertEqual(stats["totalSymbolTablesSavedToCache"], 0) diff --git a/lldb/test/API/functionalities/module_cache/debug_index/exe.yaml b/lldb/test/API/functionalities/module_cache/debug_index/exe.yaml new file mode 100644 --- /dev/null +++ b/lldb/test/API/functionalities/module_cache/debug_index/exe.yaml @@ -0,0 +1,844 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_386 +DWARF: + debug_str: + - 'Apple clang version 13.0.0 (clang-1300.0.29.3)' + - main.mm + - '/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk' + - MacOSX.sdk + - '/tmp/test' + - g_global + - int + - SimpleClass + - NSObject + - isa + - Class + - objc_class + - foo + - _Z3fooi + - '-[SimpleClass sayHello]' + - sayHello + - main + - baz + - Bar + - x + - _ZNK3baz3Bar3getEv + - get + - _ZN3baz3BarC1Ei + - _ZN3baz3BarC2Ei + - self + - _cmd + - SEL + - objc_selector + - argc + - argv + - char + - b + - this + - i + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_LLVM_sysroot + Form: DW_FORM_strp + - Attribute: DW_AT_APPLE_sdk + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + - Attribute: DW_AT_comp_dir + Form: DW_FORM_strp + - Attribute: DW_AT_APPLE_major_runtime_vers + Form: DW_FORM_data1 + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Code: 0x2 + Tag: DW_TAG_variable + Children: DW_CHILDREN_no + 
Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Code: 0x3 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x4 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_APPLE_objc_complete_type + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_APPLE_runtime_class + Form: DW_FORM_data1 + - Code: 0x5 + Tag: DW_TAG_inheritance + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_data_member_location + Form: DW_FORM_data1 + - Code: 0x6 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_APPLE_runtime_class + Form: DW_FORM_data1 + - Code: 0x7 + Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_data_member_location + Form: DW_FORM_data1 + - Attribute: DW_AT_accessibility + Form: DW_FORM_data1 + - Code: 0x8 + Tag: DW_TAG_typedef + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Code: 0x9 + Tag: DW_TAG_pointer_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0xA + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Code: 0xB + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_APPLE_omit_frame_ptr + Form: DW_FORM_flag_present + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_linkage_name + Form: DW_FORM_strp + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Code: 0xC + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0xD + Tag: DW_TAG_subprogram + Children: 
DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_APPLE_omit_frame_ptr + Form: DW_FORM_flag_present + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_object_pointer + Form: DW_FORM_ref4 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Code: 0xE + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + - Code: 0xF + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Code: 0x10 + Tag: DW_TAG_variable + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x11 + Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x12 + Tag: DW_TAG_class_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_calling_convention + Form: DW_FORM_data1 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Code: 0x13 + Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_data_member_location + Form: DW_FORM_data1 + - Code: 0x14 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_accessibility + Form: DW_FORM_data1 + - Code: 0x15 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + - Code: 0x16 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x17 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_linkage_name + Form: DW_FORM_strp + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - 
Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_accessibility + Form: DW_FORM_data1 + - Code: 0x18 + Tag: DW_TAG_const_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x19 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_object_pointer + Form: DW_FORM_ref4 + - Attribute: DW_AT_linkage_name + Form: DW_FORM_strp + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + - Code: 0x1A + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_APPLE_omit_frame_ptr + Form: DW_FORM_flag_present + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_object_pointer + Form: DW_FORM_ref4 + - Attribute: DW_AT_linkage_name + Form: DW_FORM_strp + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + debug_info: + - Length: 0x21F + Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x0 + - Value: 0x11 + - Value: 0x2F + - Value: 0x37 + - Value: 0x96 + - Value: 0x0 + - Value: 0xA1 + - Value: 0x2 + - Value: 0x0 + - Value: 0xC4 + - AbbrCode: 0x2 + Values: + - Value: 0xAB + - Value: 0x48 + - Value: 0x1 + - Value: 0x1 + - Value: 0x3 + - Value: 0x9 + BlockData: [ 0x3, 0xC4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0 ] + - AbbrCode: 0x3 + Values: + - Value: 0xB4 + - Value: 0x5 + - Value: 0x4 + - AbbrCode: 0x4 + Values: + - Value: 0x1 + - Value: 0xB8 + - Value: 0x8 + - Value: 0x1 + - Value: 0x13 + - Value: 0x11 + - AbbrCode: 0x5 + Values: + - Value: 0x5F + - Value: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x6 + Values: + - Value: 0xC4 + - Value: 0x8 + - Value: 0x2 + - Value: 0x35 + - Value: 0x11 + - AbbrCode: 0x7 + Values: + - Value: 0xCD + - Value: 0x76 + - Value: 0x2 + - Value: 0x38 + - Value: 0x0 + - Value: 0x2 + - AbbrCode: 0x0 + - AbbrCode: 0x8 + Values: + - Value: 0x81 + - Value: 0xD1 + - Value: 0x1 + - Value: 0xD + - AbbrCode: 0x9 + Values: + - Value: 0x86 + - AbbrCode: 0xA + Values: + - Value: 0xD7 + - Value: 0x1 + - AbbrCode: 0xB + Values: + - Value: 0x0 + - Value: 0x20 + - Value: 0x1 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0xE6 + - Value: 0xE2 + - Value: 0x1 + - Value: 0x6 + - Value: 0x48 + - Value: 0x1 + - AbbrCode: 0xC + Values: + - Value: 0x2 + BlockData: [ 0x91, 0xC ] + - Value: 0x11C + - Value: 0x1 + - Value: 0x6 + - Value: 0x48 + - AbbrCode: 0x0 + - AbbrCode: 0xD + Values: + - Value: 0x20 + - Value: 0x14 + - Value: 0x1 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0xD0 + - Value: 0xEE + - Value: 0x1 + - Value: 0x18 + - AbbrCode: 0xE + Values: + - Value: 0x2 + BlockData: [ 0x91, 0x8 ] + - Value: 0x155 + - Value: 0x1ED + - Value: 0x1 + - AbbrCode: 0xE + Values: + - Value: 0x2 + BlockData: [ 0x91, 0x0 ] + - Value: 0x15A + - Value: 0x1F2 + - Value: 0x1 + - AbbrCode: 0x0 + - AbbrCode: 0xF + Values: + - Value: 0x34 + - Value: 0x3C + - Value: 0x1 + BlockData: [ 0x6D ] + - Value: 0x10F + - Value: 0x1 + - Value: 0x1B + - Value: 0x48 + - Value: 0x1 + - AbbrCode: 0xC + Values: + - Value: 0x2 + BlockData: [ 0x91, 0x78 ] + - Value: 0x171 + - Value: 0x1 + - Value: 0x1B + - Value: 0x48 + - AbbrCode: 0xC + Values: + - Value: 0x2 + BlockData: [ 0x8F, 0x10 ] + - Value: 0x176 + - Value: 0x1 + - Value: 0x1B + - Value: 0x207 + - AbbrCode: 
0x10 + Values: + - Value: 0x2 + BlockData: [ 0x8F, 0xC ] + - Value: 0x180 + - Value: 0x1 + - Value: 0x1C + - Value: 0x132 + - AbbrCode: 0x0 + - AbbrCode: 0x11 + Values: + - Value: 0x114 + - AbbrCode: 0x12 + Values: + - Value: 0x5 + - Value: 0x118 + - Value: 0x4 + - Value: 0x1 + - Value: 0xA + - AbbrCode: 0x13 + Values: + - Value: 0x11C + - Value: 0x48 + - Value: 0x1 + - Value: 0xB + - Value: 0x0 + - AbbrCode: 0x14 + Values: + - Value: 0x118 + - Value: 0x1 + - Value: 0xD + - Value: 0x1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x15 + Values: + - Value: 0x172 + - Value: 0x1 + - AbbrCode: 0x16 + Values: + - Value: 0x48 + - AbbrCode: 0x0 + - AbbrCode: 0x17 + Values: + - Value: 0x11E + - Value: 0x131 + - Value: 0x1 + - Value: 0xF + - Value: 0x48 + - Value: 0x1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x15 + Values: + - Value: 0x177 + - Value: 0x1 + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x9 + Values: + - Value: 0x132 + - AbbrCode: 0x9 + Values: + - Value: 0x17C + - AbbrCode: 0x18 + Values: + - Value: 0x132 + - AbbrCode: 0x19 + Values: + - Value: 0x70 + - Value: 0x34 + - Value: 0x1 + BlockData: [ 0x6D ] + - Value: 0x19C + - Value: 0x135 + - Value: 0x147 + - AbbrCode: 0xE + Values: + - Value: 0x2 + BlockData: [ 0x91, 0x78 ] + - Value: 0x182 + - Value: 0x21D + - Value: 0x1 + - AbbrCode: 0xC + Values: + - Value: 0x2 + BlockData: [ 0x91, 0x74 ] + - Value: 0x187 + - Value: 0x1 + - Value: 0xD + - Value: 0x48 + - AbbrCode: 0x0 + - AbbrCode: 0x1A + Values: + - Value: 0xA4 + - Value: 0x20 + - Value: 0x1 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1D2 + - Value: 0x145 + - Value: 0x147 + - AbbrCode: 0xE + Values: + - Value: 0x2 + BlockData: [ 0x91, 0x8 ] + - Value: 0x182 + - Value: 0x21D + - Value: 0x1 + - AbbrCode: 0xC + Values: + - Value: 0x2 + BlockData: [ 0x91, 0x4 ] + - Value: 0x187 + - Value: 0x1 + - Value: 0xD + - Value: 0x48 + - AbbrCode: 0x0 + - AbbrCode: 0x9 + Values: + - Value: 0x4F + - AbbrCode: 0x8 + Values: + - Value: 0x1FD + - Value: 0x15F + - Value: 0x1 + - Value: 0x8 + - AbbrCode: 0x9 + Values: + - Value: 0x202 + - AbbrCode: 0xA + Values: + - Value: 0x163 + - Value: 0x1 + - AbbrCode: 0x9 + Values: + - Value: 0x20C + - AbbrCode: 0x9 + Values: + - Value: 0x211 + - AbbrCode: 0x18 + Values: + - Value: 0x216 + - AbbrCode: 0x3 + Values: + - Value: 0x17B + - Value: 0x6 + - Value: 0x1 + - AbbrCode: 0x9 + Values: + - Value: 0x132 + - AbbrCode: 0x0 + debug_line: + - Length: 250 + Version: 4 + PrologueLength: 157 + MinInstLength: 1 + MaxOpsPerInst: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + IncludeDirs: + - '/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include/objc' + Files: + - Name: main.mm + DirIdx: 0 + ModTime: 0 + Length: 0 + - Name: NSObject.h + DirIdx: 1 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 0 + - Opcode: 0x17 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 10 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: 0x83 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 0 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: DW_LNS_advance_line + SData: -7 + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_set_column + Data: 14 + - Opcode: 0x51 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 12 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_set_column + Data: 3 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_set_column + Data: 0 + - 
Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: DW_LNS_advance_line + SData: 17 + Data: 0 + - Opcode: 0x82 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 20 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: 0xBA + Data: 0 + - Opcode: DW_LNS_set_column + Data: 0 + - Opcode: 0x85 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 12 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 40 + - Opcode: 0x13 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 3 + - Opcode: 0x83 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 0 + - Opcode: DW_LNS_advance_line + SData: -16 + Data: 0 + - Opcode: 0xBA + Data: 0 + - Opcode: DW_LNS_set_column + Data: 21 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_const_add_pc + Data: 0 + - Opcode: 0xAC + Data: 0 + - Opcode: DW_LNS_set_column + Data: 22 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0xBA + Data: 0 + - Opcode: DW_LNS_set_column + Data: 0 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0xBA + Data: 0 + - Opcode: DW_LNS_set_column + Data: 18 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: 0xF2 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 16 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_set_column + Data: 22 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 8 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 +... diff --git a/lldb/test/API/functionalities/module_cache/simple_exe/TestModuleCacheSimple.py b/lldb/test/API/functionalities/module_cache/simple_exe/TestModuleCacheSimple.py --- a/lldb/test/API/functionalities/module_cache/simple_exe/TestModuleCacheSimple.py +++ b/lldb/test/API/functionalities/module_cache/simple_exe/TestModuleCacheSimple.py @@ -26,12 +26,13 @@ def get_module_cache_files(self, basename): - module_file_glob = os.path.join(self.cache_dir, "llvmcache-*%s*" % (basename)) + module_file_glob = os.path.join(self.cache_dir, + "llvmcache-*%s*-symtab-*" % (basename)) return glob.glob(module_file_glob) # Doesn't depend on any specific debug information. @no_debug_info_test - @skipIfWindows # Windows runs into trouble deleting the executable + @skipIfWindows def test(self): """ Test module cache functionality for a simple object file. diff --git a/lldb/test/Shell/Settings/TestSettingsSet.test b/lldb/test/Shell/Settings/TestSettingsSet.test --- a/lldb/test/Shell/Settings/TestSettingsSet.test +++ b/lldb/test/Shell/Settings/TestSettingsSet.test @@ -3,6 +3,11 @@ # Check that setting an empty value with -f(orce) clears the value. # RUN: not %lldb -b -s %s 2>&1 | FileCheck %s +# Make sure that "settings set -g" no longer requires a bogus filename. 
+settings set -g target.skip-prologue false +settings show target.skip-prologue +# CHECK: target.skip-prologue (boolean) = false + settings set tab-size 16 settings show tab-size # CHECK: tab-size (unsigned) = 16 diff --git a/lldb/unittests/Core/MangledTest.cpp b/lldb/unittests/Core/MangledTest.cpp --- a/lldb/unittests/Core/MangledTest.cpp +++ b/lldb/unittests/Core/MangledTest.cpp @@ -89,6 +89,25 @@ EXPECT_STREQ("", the_demangled.GetCString()); } +TEST(MangledTest, BoolConversionOperator) { + { + ConstString MangledName("_ZN1a1b1cIiiiEEvm"); + Mangled TheMangled(MangledName); + EXPECT_EQ(true, bool(TheMangled)); + EXPECT_EQ(false, !TheMangled); + } + { + ConstString UnmangledName("puts"); + Mangled TheMangled(UnmangledName); + EXPECT_EQ(true, bool(TheMangled)); + EXPECT_EQ(false, !TheMangled); + } + { + Mangled TheMangled{}; + EXPECT_EQ(false, bool(TheMangled)); + EXPECT_EQ(true, !TheMangled); + } +} TEST(MangledTest, NameIndexes_FindFunctionSymbols) { SubsystemRAII diff --git a/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt b/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt --- a/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt +++ b/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt @@ -1,6 +1,7 @@ add_lldb_unittest(SymbolFileDWARFTests DWARFASTParserClangTests.cpp DWARFDIETest.cpp + DWARFIndexCachingTest.cpp DWARFUnitTest.cpp SymbolFileDWARFTests.cpp XcodeSDKModuleTests.cpp diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFIndexCachingTest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFIndexCachingTest.cpp new file mode 100644 --- /dev/null +++ b/lldb/unittests/SymbolFile/DWARF/DWARFIndexCachingTest.cpp @@ -0,0 +1,198 @@ +//===-- DWARFIndexCachingTest.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Plugins/SymbolFile/DWARF/DIERef.h" +#include "Plugins/SymbolFile/DWARF/DWARFDIE.h" +#include "Plugins/SymbolFile/DWARF/ManualDWARFIndex.h" +#include "Plugins/SymbolFile/DWARF/NameToDIE.h" +#include "TestingSupport/Symbol/YAMLModuleTester.h" +#include "lldb/Core/DataFileCache.h" +#include "lldb/Core/ModuleList.h" +#include "lldb/Utility/DataEncoder.h" +#include "lldb/Utility/DataExtractor.h" +#include "llvm/ADT/STLExtras.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using namespace lldb; +using namespace lldb_private; + +static void EncodeDecode(const DIERef &object, ByteOrder byte_order) { + const uint8_t addr_size = 8; + DataEncoder encoder(byte_order, addr_size); + object.Encode(encoder); + llvm::ArrayRef<uint8_t> bytes = encoder.GetData(); + DataExtractor data(bytes.data(), bytes.size(), byte_order, addr_size); + offset_t data_offset = 0; + EXPECT_EQ(object, DIERef::Decode(data, &data_offset)); +} + +static void EncodeDecode(const DIERef &object) { + EncodeDecode(object, eByteOrderLittle); + EncodeDecode(object, eByteOrderBig); +} + +TEST(DWARFIndexCachingTest, DIERefEncodeDecode) { + // Tests DIERef::Encode(...) and DIERef::Decode(...)
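// Each case below round-trips a DIERef through the EncodeDecode helpers
// above: Encode() writes the object into a DataEncoder, the raw bytes are
// wrapped in a DataExtractor, and Decode() must reproduce an equal object,
// in both little- and big-endian byte orders.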
+ EncodeDecode(DIERef(llvm::None, DIERef::Section::DebugInfo, 0x11223344)); + EncodeDecode(DIERef(llvm::None, DIERef::Section::DebugTypes, 0x11223344)); + EncodeDecode(DIERef(100, DIERef::Section::DebugInfo, 0x11223344)); + EncodeDecode(DIERef(200, DIERef::Section::DebugTypes, 0x11223344)); +} + +static void EncodeDecode(const NameToDIE &object, ByteOrder byte_order) { + const uint8_t addr_size = 8; + DataEncoder encoder(byte_order, addr_size); + DataEncoder strtab_encoder(byte_order, addr_size); + ConstStringTable const_strtab; + + object.Encode(encoder, const_strtab); + + llvm::ArrayRef<uint8_t> bytes = encoder.GetData(); + DataExtractor data(bytes.data(), bytes.size(), byte_order, addr_size); + + const_strtab.Encode(strtab_encoder); + llvm::ArrayRef<uint8_t> strtab_bytes = strtab_encoder.GetData(); + DataExtractor strtab_data(strtab_bytes.data(), strtab_bytes.size(), + byte_order, addr_size); + StringTableReader strtab_reader; + offset_t strtab_data_offset = 0; + ASSERT_EQ(strtab_reader.Decode(strtab_data, &strtab_data_offset), true); + + NameToDIE decoded_object; + offset_t data_offset = 0; + decoded_object.Decode(data, &data_offset, strtab_reader); + EXPECT_TRUE(object == decoded_object); +} + +static void EncodeDecode(const NameToDIE &object) { + EncodeDecode(object, eByteOrderLittle); + EncodeDecode(object, eByteOrderBig); +} + +TEST(DWARFIndexCachingTest, NameToDIEEncodeDecode) { + NameToDIE map; + // Make sure an empty NameToDIE map encodes and decodes correctly. + EncodeDecode(map); + map.Insert(ConstString("hello"), + DIERef(llvm::None, DIERef::Section::DebugInfo, 0x11223344)); + map.Insert(ConstString("world"), + DIERef(100, DIERef::Section::DebugInfo, 0x11223344)); + // Make sure a valid NameToDIE map encodes and decodes correctly. + EncodeDecode(map); +} + +static void EncodeDecode(const ManualDWARFIndex::IndexSet &object, + ByteOrder byte_order) { + const uint8_t addr_size = 8; + DataEncoder encoder(byte_order, addr_size); + DataEncoder strtab_encoder(byte_order, addr_size); + object.Encode(encoder); + llvm::ArrayRef<uint8_t> bytes = encoder.GetData(); + DataExtractor data(bytes.data(), bytes.size(), byte_order, addr_size); + ManualDWARFIndex::IndexSet decoded_object; + offset_t data_offset = 0; + decoded_object.Decode(data, &data_offset); + EXPECT_TRUE(object == decoded_object); +} + +static void EncodeDecode(const ManualDWARFIndex::IndexSet &object) { + EncodeDecode(object, eByteOrderLittle); + EncodeDecode(object, eByteOrderBig); +} + +TEST(DWARFIndexCachingTest, ManualDWARFIndexIndexSetEncodeDecode) { + ManualDWARFIndex::IndexSet set; + // Make sure an empty IndexSet can be encoded and decoded correctly. + EncodeDecode(set); + + dw_offset_t die_offset = 0; + // Make sure an IndexSet with only items in IndexSet::function_basenames can + // be encoded and decoded correctly. + set.function_basenames.Insert( + ConstString("a"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + EncodeDecode(set); + set.function_basenames.Clear(); + // Make sure an IndexSet with only items in IndexSet::function_fullnames can + // be encoded and decoded correctly. + set.function_fullnames.Insert( + ConstString("a"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + EncodeDecode(set); + set.function_fullnames.Clear(); + // Make sure an IndexSet with only items in IndexSet::function_methods can + // be encoded and decoded correctly.
+ set.function_methods.Insert( + ConstString("a"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + EncodeDecode(set); + set.function_methods.Clear(); + // Make sure an IndexSet with only items in IndexSet::function_selectors can + // be encoded and decoded correctly. + set.function_selectors.Insert( + ConstString("a"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + EncodeDecode(set); + set.function_selectors.Clear(); + // Make sure an IndexSet with only items in IndexSet::objc_class_selectors can + // be encoded and decoded correctly. + set.objc_class_selectors.Insert( + ConstString("a"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + EncodeDecode(set); + set.objc_class_selectors.Clear(); + // Make sure an IndexSet with only items in IndexSet::globals can + // be encoded and decoded correctly. + set.globals.Insert( + ConstString("a"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + EncodeDecode(set); + set.globals.Clear(); + // Make sure an IndexSet with only items in IndexSet::types can + // be encoded and decoded correctly. + set.types.Insert( + ConstString("a"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + EncodeDecode(set); + set.types.Clear(); + // Make sure an IndexSet with only items in IndexSet::namespaces can + // be encoded and decoded correctly. + set.namespaces.Insert( + ConstString("a"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + EncodeDecode(set); + set.namespaces.Clear(); + // Make sure an IndexSet with items in all NameToDIE maps can + // be encoded and decoded correctly. + set.function_basenames.Insert( + ConstString("a"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + set.function_fullnames.Insert( + ConstString("b"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + set.function_methods.Insert( + ConstString("c"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + set.function_selectors.Insert( + ConstString("d"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + set.objc_class_selectors.Insert( + ConstString("e"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + set.globals.Insert( + ConstString("f"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + set.types.Insert( + ConstString("g"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + set.namespaces.Insert( + ConstString("h"), + DIERef(llvm::None, DIERef::Section::DebugInfo, ++die_offset)); + EncodeDecode(set); +} diff --git a/llvm/bindings/go/llvm/dibuilder.go b/llvm/bindings/go/llvm/dibuilder.go --- a/llvm/bindings/go/llvm/dibuilder.go +++ b/llvm/bindings/go/llvm/dibuilder.go @@ -563,10 +563,10 @@ // CreateExpression creates a new descriptor for the specified // variable which has a complex address expression for its address.
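// Note: the slice elements are DWARF expression operations and operands,
// which the C API treats as unsigned values; the pointer below is passed
// straight through to LLVMDIBuilderCreateExpression, whose signature takes
// a uint64_t * after this change.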
-func (d *DIBuilder) CreateExpression(addr []int64) Metadata { - var data *C.int64_t +func (d *DIBuilder) CreateExpression(addr []uint64) Metadata { + var data *C.uint64_t if len(addr) > 0 { - data = (*C.int64_t)(unsafe.Pointer(&addr[0])) + data = (*C.uint64_t)(unsafe.Pointer(&addr[0])) } result := C.LLVMDIBuilderCreateExpression(d.ref, data, C.size_t(len(addr))) return Metadata{C: result} diff --git a/llvm/bindings/ocaml/debuginfo/debuginfo_ocaml.c b/llvm/bindings/ocaml/debuginfo/debuginfo_ocaml.c --- a/llvm/bindings/ocaml/debuginfo/debuginfo_ocaml.c +++ b/llvm/bindings/ocaml/debuginfo/debuginfo_ocaml.c @@ -865,7 +865,7 @@ LLVMMetadataRef llvm_dibuild_create_constant_value_expression(value Builder, value Value) { return LLVMDIBuilderCreateConstantValueExpression(DIBuilder_val(Builder), - (int64_t)Int_val(Value)); + (uint64_t)Int_val(Value)); } LLVMMetadataRef llvm_dibuild_create_global_variable_expression_native( diff --git a/llvm/docs/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack.md b/llvm/docs/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack.md --- a/llvm/docs/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack.md +++ b/llvm/docs/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack.md @@ -1,33 +1,88 @@ # Allow Location Descriptions on the DWARF Expression Stack -- [Extension](#extension) -- [Heterogeneous Computing Devices](#heterogeneous-computing-devices) -- [DWARF 5](#dwarf-5) - - [How DWARF Maps Source Language To Hardware](#how-dwarf-maps-source-language-to-hardware) - - [Examples](#examples) - - [Dynamic Array Size](#dynamic-array-size) - - [Variable Location in Register](#variable-location-in-register) - - [Variable Location in Memory](#variable-location-in-memory) - - [Variable Spread Across Different Locations](#variable-spread-across-different-locations) - - [Offsetting a Composite Location](#offsetting-a-composite-location) - - [Limitations](#limitations) -- [Extension Solution](#extension-solution) - - [Location Description](#location-description) - - [Stack Location Description Operations](#stack-location-description-operations) - - [Examples](#examples-1) - - [Source Language Variable Spilled to Part of a Vector Register](#source-language-variable-spilled-to-part-of-a-vector-register) - - [Source Language Variable Spread Across Multiple Vector Registers](#source-language-variable-spread-across-multiple-vector-registers) - - [Source Language Variable Spread Across Multiple Kinds of Locations](#source-language-variable-spread-across-multiple-kinds-of-locations) - - [Address Spaces](#address-spaces) - - [Bit Offsets](#bit-offsets) - - [Call Frame Information (CFI)](#call-frame-information-cfi) - - [Objects Not In Byte Aligned Global Memory](#objects-not-in-byte-aligned-global-memory) - - [Higher Order Operations](#higher-order-operations) - - [Objects In Multiple Places](#objects-in-multiple-places) -- [Conclusion](#conclusion) -- [Further Information](#further-information) - -# Extension +- [1. Extension](#extension) +- [2. Heterogeneous Computing Devices](#heterogeneous-computing-devices) +- [3. 
DWARF 5](#dwarf-5) + - [3.1 How DWARF Maps Source Language To Hardware](#how-dwarf-maps-source-language-to-hardware) + - [3.2 Examples](#examples) + - [3.2.1 Dynamic Array Size](#dynamic-array-size) + - [3.2.2 Variable Location in Register](#variable-location-in-register) + - [3.2.3 Variable Location in Memory](#variable-location-in-memory) + - [3.2.4 Variable Spread Across Different Locations](#variable-spread-across-different-locations) + - [3.2.5 Offsetting a Composite Location](#offsetting-a-composite-location) + - [3.3 Limitations](#limitations) +- [4. Extension Solution](#extension-solution) + - [4.1 Location Description](#location-description) + - [4.2 Stack Location Description Operations](#stack-location-description-operations) + - [4.3 Examples](#examples-1) + - [4.3.1 Source Language Variable Spilled to Part of a Vector Register](#source-language-variable-spilled-to-part-of-a-vector-register) + - [4.3.2 Source Language Variable Spread Across Multiple Vector Registers](#source-language-variable-spread-across-multiple-vector-registers) + - [4.3.3 Source Language Variable Spread Across Multiple Kinds of Locations](#source-language-variable-spread-across-multiple-kinds-of-locations) + - [4.3.4 Address Spaces](#address-spaces) + - [4.3.5 Bit Offsets](#bit-offsets) + - [4.4 Call Frame Information (CFI)](#call-frame-information-cfi) + - [4.5 Objects Not In Byte Aligned Global Memory](#objects-not-in-byte-aligned-global-memory) + - [4.6 Higher Order Operations](#higher-order-operations) + - [4.7 Objects In Multiple Places](#objects-in-multiple-places) +- [5. Conclusion](#conclusion) +- [A. Changes to DWARF Debugging Information Format Version 5](#a-changes-to-dwarf-debugging-information-format-version-5) + - [A.2 General Description](#a-2-general-description) + - [A.2.5 DWARF Expressions](#a-2-5-dwarf-expressions) + - [A.2.5.1 DWARF Expression Evaluation Context](#a-2-5-1-dwarf-expression-evaluation-context) + - [A.2.5.2 DWARF Expression Value](#a-2-5-2-dwarf-expression-value) + - [A.2.5.3 DWARF Location Description](#a-2-5-3-dwarf-location-description) + - [A.2.5.4 DWARF Operation Expressions](#a-2-5-4-dwarf-operation-expressions) + - [A.2.5.4.1 Stack Operations](#a-2-5-4-1-stack-operations) + - [A.2.5.4.2 Control Flow Operations](#a-2-5-4-2-control-flow-operations) + - [A.2.5.4.3 Value Operations](#a-2-5-4-3-value-operations) + - [A.2.5.4.3.1 Literal Operations](#a-2-5-4-3-1-literal-operations) + - [A.2.5.4.3.2 Arithmetic and Logical Operations](#a-2-5-4-3-2-arithmetic-and-logical-operations) + - [A.2.5.4.3.3 Type Conversion Operations](#a-2-5-4-3-3-type-conversion-operations) + - [A.2.5.4.3.4 Special Value Operations](#a-2-5-4-3-4-special-value-operations) + - [A.2.5.4.4 Location Description Operations](#a-2-5-4-4-location-description-operations) + - [A.2.5.4.4.1 General Location Description Operations](#a-2-5-4-4-1-general-location-description-operations) + - [A.2.5.4.4.2 Undefined Location Description Operations](#a-2-5-4-4-2-undefined-location-description-operations) + - [A.2.5.4.4.3 Memory Location Description Operations](#a-2-5-4-4-3-memory-location-description-operations) + - [A.2.5.4.4.4 Register Location Description Operations](#a-2-5-4-4-4-register-location-description-operations) + - [A.2.5.4.4.5 Implicit Location Description Operations](#a-2-5-4-4-5-implicit-location-description-operations) + - [A.2.5.4.4.6 Composite Location Description Operations](#a-2-5-4-4-6-composite-location-description-operations) + - [A.2.5.5 DWARF Location List 
Expressions](#a-2-5-5-dwarf-location-list-expressions) + - [A.3 Program Scope Entries](#a-3-program-scope-entries) + - [A.3.3 Subroutine and Entry Point Entries](#a-3-3-subroutine-and-entry-point-entries) + - [A.3.3.5 Low-Level Information](#a-3-3-5-low-level-information) + - [A.3.4 Call Site Entries and Parameters](#a-3-4-call-site-entries-and-parameters) + - [A.3.4.2 Call Site Parameters](#a-3-4-2-call-site-parameters) + - [A.3.5 Lexical Block Entries](#a-3-5-lexical-block-entries) + - [A.4 Data Object and Object List Entries](#a-4-data-object-and-object-list-entries) + - [A.4.1 Data Object Entries](#a-4-1-data-object-entries) + - [A.5 Type Entries](#a-5-type-entries) + - [A.5.7 Structure, Union, Class and Interface Type Entries](#a-5-7-structure-union-class-and-interface-type-entries) + - [A.5.7.3 Derived or Extended Structures, Classes and Interfaces](#a-5-7-3-derived-or-extended-structures-classes-and-interfaces) + - [A.5.7.8 Member Function Entries](#a-5-7-8-member-function-entries) + - [A.5.14 Pointer to Member Type Entries](#a-5-14-pointer-to-member-type-entries) + - [A.5.16 Dynamic Type Entries](#a-5-16-dynamic-type-entries) + - [A.6 Other Debugging Information](#a-6-other-debugging-information) + - [A.6.2 Line Number Information](#a-6-2-line-number-information) + - [A.6.4 Call Frame Information](#a-6-4-call-frame-information) + - [A.6.4.1 Structure of Call Frame Information](#a-6-4-1-structure-of-call-frame-information) + - [A.6.4.2 Call Frame Instructions](#a-6-4-2-call-frame-instructions) + - [A.6.4.2.1 Row Creation Instructions](#a-6-4-2-1-row-creation-instructions) + - [A.6.4.2.2 CFA Definition Instructions](#a-6-4-2-2-cfa-definition-instructions) + - [A.6.4.2.3 Register Rule Instructions](#a-6-4-2-3-register-rule-instructions) + - [A.6.4.2.4 Row State Instructions](#a-6-4-2-4-row-state-instructions) + - [A.6.4.2.5 Padding Instruction](#a-6-4-2-5-padding-instruction) + - [A.6.4.3 Call Frame Instruction Usage](#a-6-4-3-call-frame-instruction-usage) + - [A.6.4.4 Call Frame Calling Address](#a-6-4-4-call-frame-calling-address) + - [A.7 Data Representation](#a-7-data-representation) + - [A.7.4 32-Bit and 64-Bit DWARF Formats](#a-7-4-32-bit-and-64-bit-dwarf-formats) + - [A.7.5 Format of Debugging Information](#a-7-5-format-of-debugging-information) + - [A.7.5.5 Classes and Forms](#a-7-5-5-classes-and-forms) + - [A.7.7 DWARF Expressions](#a-7-7-dwarf-expressions) + - [A.7.7.1 Operation Expressions](#a-7-7-1-operation-expressions) + - [A.7.7.3 Location List Expressions](#a-7-7-3-location-list-expressions) +- [B. Further Information](#b-further-information) + +# 1. Extension In DWARF 5, expressions are evaluated using a typed value stack, a separate location area, and an independent loclist mechanism. This extension unifies all @@ -54,15 +109,22 @@ consumers than a smaller number of general composable operations that have consistent semantics regardless of context. -The following sections first describe heterogeneous devices and the features -they have that are not addressed by DWARF 5. Then a brief simplified overview of -the DWARF 5 expression evaluation model is presented that highlights the -difficulties for supporting the heterogeneous features. Finally, an overview of -the extension is presented, using simplified examples to illustrate how it can -address the issues of heterogeneous devices and also benefit non-heterogeneous -devices. References to further information are provided. - -# Heterogeneous Computing Devices +First, section [2. 
Heterogeneous Computing +Devices](#heterogeneous-computing-devices) describes heterogeneous devices and +the features they have that are not addressed by DWARF 5. Then section [3. DWARF +5](#dwarf-5) presents a brief simplified overview of the DWARF 5 expression +evaluation model that highlights the difficulties for supporting the +heterogeneous features. Next, section [4. Extension +Solution](#extension-solution) provides an overview of the proposal, using +simplified examples to illustrate how it can address the issues of heterogeneous +devices and also benefit non-heterogeneous devices. Then overall conclusions are +covered in section [5. Conclusion](#conclusion). Appendix [A. Changes to DWARF +Debugging Information Format Version +5](#a-changes-to-dwarf-debugging-information-format-version-5) gives changes +relative to the DWARF Version 5 standard. Finally, appendix [B. Further +Information](#b-further-information) has references to further information. + +# 2. Heterogeneous Computing Devices GPUs and other heterogeneous computing devices have features not common to CPU computing devices. @@ -101,13 +163,13 @@ there is a benefit to be able to factorize their calculation which requires all location kinds to be supported uniformly, otherwise duplication is necessary. -# DWARF 5 +# 3. DWARF 5 Before presenting the proposed solution to supporting heterogeneous devices, a brief overview of the DWARF 5 expression evaluation model will be given to highlight the aspects being addressed by the extension. -## How DWARF Maps Source Language To Hardware +## 3.1 How DWARF Maps Source Language To Hardware DWARF is a standardized way to specify debug information. It describes source language entities such as compilation units, functions, types, variables, etc. @@ -158,13 +220,13 @@ value or with the location of a base object that is available using the DW_OP_push_object_address operation. -## Examples +## 3.2 Examples The following examples illustrate how DWARF expressions involving operations are evaluated in DWARF 5. DWARF also has expressions involving location lists that are not covered in these examples. -### Dynamic Array Size +### 3.2.1 Dynamic Array Size The first example is for an operation expression associated with a DIE attribute that provides the number of elements in a dynamic array type. Such an attribute @@ -211,7 +273,7 @@ expression that is evaluated with a value result kind context is the top element of the stack, which provides the value and its type. -### Variable Location in Register +### 3.2.2 Variable Location in Register This example is for an operation expression associated with a DIE attribute that provides the location of a source language variable. Such an attribute dictates @@ -244,7 +306,7 @@ an expression that is evaluated with a location result kind context is the location description in the location area. -### Variable Location in Memory +### 3.2.3 Variable Location in Memory The next example is for an operation expression associated with a DIE attribute that provides the location of a source language variable that is allocated in a @@ -285,7 +347,7 @@ ![Variable Location in Memory Example: Step 4](images/03-memory.example.frame.4.png) -### Variable Spread Across Different Locations +### 3.2.4 Variable Spread Across Different Locations This example is for a source variable that is partly in a register, partly undefined, and partly in memory. 
@@ -349,11 +411,12 @@ ![Variable Spread Across Different Locations Example: Step 7](images/04-composite.example.frame.7.png) -### Offsetting a Composite Location +### 3.2.5 Offsetting a Composite Location This example attempts to extend the previous example to offset the composite -location description it created. The *Variable Location in Memory* example -conveniently used the DW_OP_plus operation to offset a memory address. +location description it created. The [3.2.3 Variable Location in +Memory](#variable-location-in-memory) example conveniently used the DW_OP_plus +operation to offset a memory address. DW_OP_regx SGPR3 DW_OP_piece 4 @@ -380,7 +443,7 @@ This illustrates that operations on stack values are not composable with operations on location descriptions. -## Limitations +## 3.3 Limitations DWARF 5 is unable to describe variables in runtime indexed parts of registers. This is required to describe a source variable that is located in a lane of a @@ -406,7 +469,7 @@ Supporting them in a uniform manner for all location kinds is required to support languages with bit sized entities. -# Extension Solution +# 4. Extension Solution This section outlines the extension to generalize the DWARF expression evaluation model to allow location descriptions to be manipulated on the stack. It presents @@ -414,7 +477,7 @@ solves the issues of heterogeneous devices. It presents how this is done in a manner that is backwards compatible with DWARF 5. -## Location Description +## 4.1 Location Description In order to have consistent, composable operations that act on location descriptions, the extension defines a uniform way to handle all location kinds. @@ -437,7 +500,7 @@ undefined. - For composite, it is a linear stream of bytes defined by the composite's parts. -## Stack Location Description Operations +## 4.2 Stack Location Description Operations The DWARF expression stack is extended to allow each stack entry to either be a value or a location description. @@ -475,12 +538,12 @@ location description. Currently this is only possible as a piece of a composite when the stack is empty. -## Examples +## 4.3 Examples This section provides some motivating examples to illustrate the benefits that result from allowing location descriptions on the stack. -### Source Language Variable Spilled to Part of a Vector Register +### 4.3.1 Source Language Variable Spilled to Part of a Vector Register A compiler generating code for a GPU may allocate a source language variable that it proves has the same value for every lane of a SIMT thread in a scalar @@ -522,7 +585,7 @@ would also not permit a runtime index into part of the whole register to be used as shown in the next example. -### Source Language Variable Spread Across Multiple Vector Registers +### 4.3.2 Source Language Variable Spread Across Multiple Vector Registers A compiler may generate SIMT code for a GPU. Each source language thread of execution is mapped to a single lane of the GPU thread. 
Source language @@ -623,7 +686,7 @@ ![Source Language Variable Spread Across Multiple Vector Registers Example: Step 14](images/07-extension-multi-lane-vgpr.example.frame.14.png) -### Source Language Variable Spread Across Multiple Kinds of Locations +### 4.3.3 Source Language Variable Spread Across Multiple Kinds of Locations This example is the same as the previous one, except the first 2 bytes of the second vector register have been spilled to memory, and the last 2 bytes have @@ -683,7 +746,7 @@ ![Source Language Variable Spread Across Multiple Kinds of Locations Example: Step 12](images/08-extension-mixed-composite.example.frame.7.png) -### Address Spaces +### 4.3.4 Address Spaces Heterogeneous devices can have multiple hardware supported address spaces which use specific hardware instructions to access them. @@ -752,7 +815,7 @@ allocated in a register are spilled to a stack frame that resides in the non-global address space. -### Bit Offsets +### 4.3.5 Bit Offsets With the generalization of location descriptions on the stack, it is possible to define a DW_OP_bit_offset operation that adjusts the offset of any kind of @@ -792,7 +855,7 @@ architecture. A base type could be extended to specify bit ordering in addition to byte ordering. -## Call Frame Information (CFI) +## 4.4 Call Frame Information (CFI) DWARF defines call frame information (CFI) that can be used to virtually unwind the subprogram call stack. This involves determining the location where register @@ -804,7 +867,7 @@ Therefore, the extension extends the CFI rules to support any kind of location description, and operations to create locations in address spaces. -## Objects Not In Byte Aligned Global Memory +## 4.5 Objects Not In Byte Aligned Global Memory DWARF 5 only effectively supports byte aligned memory locations on the stack by using a global memory address as a proxy for a memory location description. This @@ -842,7 +905,7 @@ Full general support for bit fields and implicit locations benefits optimizations on any target. -## Higher Order Operations +## 4.6 Higher Order Operations The generalization allows an elegant way to add higher order operations that create location descriptions out of other location descriptions in a general @@ -876,7 +939,7 @@ promoted into a vector register when executing a set of iterations of a loop in a SIMD manner. -## Objects In Multiple Places +## 4.7 Objects In Multiple Places A compiler may allocate a source variable in stack frame memory, but for some range of code may promote it to a register. If the generated code does not @@ -922,7 +985,7 @@ descriptions, the consumer can ensure any updates are done to all of them, and any reads can use any one of them. -# Conclusion +# 5. Conclusion A strength of DWARF is that it has generally sought to provide generalized composable solutions that address many problems, rather than solutions that only @@ -932,17 +995,2705 @@ heterogeneous computing devices, provides benefits for non-heterogeneous devices, and can help address a number of other previously reported issues. -# Further Information +# A. Changes to DWARF Debugging Information Format Version 5 + +> NOTE: This appendix provides changes relative to DWARF Version 5. It has been +> defined such that it is backwards compatible with DWARF Version 5. +> Non-normative text is shown in italics. The section numbers generally +> correspond to those in the DWARF Version 5 standard unless specified +> otherwise. 
Definitions are given to clarify how existing expression +> operations, CFI operations, and attributes behave with respect to generalized +> location descriptions that support multiple places. +> +> > NOTE: Notes are included to describe how the changes are to be applied to +> > the DWARF Version 5 standard. They also describe rationale and issues that +> > may need further consideration. + +## A.2 General Description + +### A.2.5 DWARF Expressions + +> NOTE: This section, and its nested sections, replaces DWARF Version 5 section +> 2.5 and section 2.6. It is based on the text of the existing DWARF Version 5 +> standard. + +DWARF expressions describe how to compute a value or specify a location. + +The evaluation of a DWARF expression can provide the location of an object, +the value of an array bound, the length of a dynamic string, the desired value +itself, and so on. + +If the evaluation of a DWARF expression does not encounter an error, then it can +either result in a value (see [2.5.2 DWARF Expression +Value](#dwarf-expression-value)) or a location description (see [2.5.3 DWARF +Location Description](#dwarf-location-description)). When a DWARF expression +is evaluated, it may be specified whether a value or location description is +required as the result kind. + +If a result kind is specified, and the result of the evaluation does not match +the specified result kind, then the implicit conversions described in [2.5.4.4.3 +Memory Location Description +Operations](#memory-location-description-operations) are performed if +valid. Otherwise, the DWARF expression is ill-formed. + +If the evaluation of a DWARF expression encounters an evaluation error, then the +result is an evaluation error. + +> NOTE: Decided to define the concept of an evaluation error. An alternative is +> to introduce an undefined value base type in a similar way to location +> descriptions having an undefined location description. Then operations that +> encounter an evaluation error can return the undefined location description or +> value with an undefined base type. +> +> All operations that act on values would return an undefined entity if given an +> undefined value. The expression would then always evaluate to completion, and +> can be tested to determine if it is an undefined entity. +> +> However, this would add considerable additional complexity and does not match +> the fact that GDB throws an exception when these evaluation errors occur. + +If a DWARF expression is ill-formed, then the result is undefined. + +The following sections detail the rules for when a DWARF expression is +ill-formed or results in an evaluation error. + +A DWARF expression can either be encoded as an operation expression (see [2.5.4 +DWARF Operation Expressions](#dwarf-operation-expressions)), or as a +location list expression (see [2.5.5 DWARF Location List +Expressions](#dwarf-location-list-expressions)). + +#### A.2.5.1 DWARF Expression Evaluation Context + +A DWARF expression is evaluated in a context that can include a number of +context elements. If multiple context elements are specified then they must be +self-consistent or the result of the evaluation is undefined. The context +elements that can be specified are: + +1. A current result kind + + The kind of result required by the DWARF expression evaluation. If specified + it can be a location description or a value. + +2.
A current thread + + The target architecture thread identifier of the source program thread of + execution for which a user-presented expression is currently being + evaluated. + + It is required for operations that are related to target architecture + threads. + + For example, the `DW_OP_regval_type` operation. + +3. A current call frame + + The target architecture call frame identifier. It identifies a call frame + that corresponds to an active invocation of a subprogram in the current + thread. It is identified by its address on the call stack. The address is + referred to as the Canonical Frame Address (CFA). The call frame information + is used to determine the CFA for the call frames of the current thread's + call stack (see [6.4 Call Frame Information](#call-frame-information)). + + It is required for operations that specify target architecture registers to + support virtual unwinding of the call stack. + + For example, the `DW_OP_*reg*` operations. + + If specified, it must be an active call frame in the current thread. + Otherwise the result is undefined. + + If it is the currently executing call frame, then it is termed the top call + frame. + +4. A current program location + + The target architecture program location corresponding to the current call + frame of the current thread. + + The program location of the top call frame is the target architecture + program counter for the current thread. The call frame information is used + to obtain the value of the return address register to determine the program + location of the other call frames (see [6.4 Call Frame + Information](#call-frame-information)). + + It is required for the evaluation of location list expressions to select + amongst multiple program location ranges. It is required for operations that + specify target architecture registers to support virtual unwinding of the + call stack (see [6.4 Call Frame Information](#call-frame-information)). + + If specified: + + - If the current call frame is the top call frame, it must be the current + target architecture program location. + - If the current call frame F is not the top call frame, it must be the + program location associated with the call site in F at which F invoked the + callee frame. + - Otherwise the result is undefined. + +5. A current compilation unit + + The compilation unit debug information entry that contains the DWARF + expression being evaluated. + + It is required for operations that reference debug information associated + with the same compilation unit, including indicating if such references use + the 32-bit or 64-bit DWARF format. It can also provide the default address + space address size if no current target architecture is specified. + + For example, the `DW_OP_constx` and `DW_OP_addrx` operations. + + Note that this compilation unit may not be the same as the compilation + unit determined from the loaded code object corresponding to the current + program location. For example, the evaluation of the expression E associated + with a `DW_AT_location` attribute of the debug information entry operand of + the `DW_OP_call*` operations is evaluated with the compilation unit that + contains E and not the one that contains the `DW_OP_call*` operation + expression. + +6. A current target architecture + + The target architecture. + + It is required for operations that specify target architecture specific + entities.
+ + For example, target architecture specific entities include DWARF register + identifiers, DWARF address space identifiers, the default address space, and + the address space address sizes. + + If specified: + + - If the current thread is specified, then the current target architecture + must be the same as the target architecture of the current thread. + - If the current compilation unit is specified, then the current target + architecture default address space address size must be the same as the + `address_size` field in the header of the current compilation unit and any + associated entry in the `.debug_aranges` section. + - If the current program location is specified, then the current target + architecture must be the same as the target architecture of any line + number information entry (see [6.2 Line Number + Information](#line-number-information)) corresponding to the current + program location. + - If the current program location is specified, then the current target + architecture default address space address size must be the same as the + `address_size` field in the header of any entry corresponding to the + current program location in the `.debug_addr`, `.debug_line`, + `.debug_rnglists`, `.debug_rnglists.dwo`, `.debug_loclists`, and + `.debug_loclists.dwo` sections. + - Otherwise the result is undefined. + +7. A current object + + The location description of a program object. + + It is required for the `DW_OP_push_object_address` operation. + + For example, the `DW_AT_data_location` attribute on type debug + information entries specifies the program object corresponding to a runtime + descriptor as the current object when it evaluates its associated + expression. + + The result is undefined if the location descriptor is invalid (see [2.5.3 + DWARF Location Description](#dwarf-location-description)). + +8. An initial stack + + This is a list of values or location descriptions that will be pushed on the + operation expression evaluation stack in the order provided before + evaluation of an operation expression starts. + + Some debugger information entries have attributes that evaluate their DWARF + expression value with initial stack entries. In all other cases the initial + stack is empty. + + The result is undefined if any location descriptors are invalid (see [2.5.3 + DWARF Location Description](#dwarf-location-description)). + +If the evaluation requires a context element that is not specified, then the +result of the evaluation is an error. + +A DWARF expression for a location description may be able to be evaluated +without a thread, call frame, program location, or architecture context. For +example, the location of a global variable may be able to be evaluated without +such context. If the expression evaluates with an error then it may indicate the +variable has been optimized and so requires more context. + +The DWARF expressions for call frame information operations (see [6.4 Call Frame +Information](#call-frame-information)) are restricted to those +that do not require the compilation unit context to be specified. + +The DWARF is ill-formed if the `address_size` fields in the headers of all +the entries in the `.debug_info`, `.debug_addr`, `.debug_line`, +`.debug_rnglists`, `.debug_rnglists.dwo`, `.debug_loclists`, and +`.debug_loclists.dwo` sections corresponding to any given program location do +not all match.
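The eight context elements above can be pictured as a record of optional fields. The following C++ is a hypothetical sketch of that idea, not the API of any actual DWARF consumer; all names are illustrative:

    #include <cstdint>
    #include <optional>
    #include <vector>

    enum class ResultKind { Value, LocationDescription };

    // Hypothetical evaluation context; an operation that needs an element
    // that is not set produces an evaluation error.
    struct EvaluationContext {
      std::optional<ResultKind> result_kind;    // 1. current result kind
      std::optional<uint64_t> thread;           // 2. current thread
      std::optional<uint64_t> call_frame_cfa;   // 3. current call frame (CFA)
      std::optional<uint64_t> program_location; // 4. current program location
      std::optional<uint64_t> compilation_unit; // 5. current compilation unit
      std::optional<uint32_t> target_arch;      // 6. current target architecture
      std::optional<uint64_t> current_object;   // 7. current object (a location)
      std::vector<uint64_t> initial_stack;      // 8. initial stack entries
    };

+#### A.2.5.2 DWARF Expression Value + +A value has a type and a literal value.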
It can represent a literal value of any +supported base type of the target architecture. The base type specifies the +size, encoding, and endianity of the literal value. + +> NOTE: It may be desirable to add an implicit pointer base type encoding. It +> would be used for the type of the value that is produced when the +> `DW_OP_deref*` operation retrieves the full contents of an implicit pointer +> location storage created by the `DW_OP_implicit_pointer` operation. The +> literal value would record the debugging information entry and byte +> displacement specified by the associated `DW_OP_implicit_pointer` operation. + +There is a distinguished base type termed the generic type, which is an integral +type that has the size of an address in the target architecture default address +space, a target architecture defined endianity, and unspecified signedness. + +The generic type is the same as the unspecified type used for stack +operations defined in DWARF Version 4 and before. + +An integral type is a base type that has an encoding of `DW_ATE_signed`, +`DW_ATE_signed_char`, `DW_ATE_unsigned`, `DW_ATE_unsigned_char`, +`DW_ATE_boolean`, or any target architecture defined integral encoding in the +inclusive range `DW_ATE_lo_user` to `DW_ATE_hi_user`. + +> NOTE: It is unclear if `DW_ATE_address` is an integral type. GDB does not seem +> to consider it as integral. + +#### A.2.5.3 DWARF Location Description + +Debugging information must provide consumers a way to find the location of +program variables, determine the bounds of dynamic arrays and strings, and +possibly to find the base address of a subprogram's call frame or the return +address of a subprogram. Furthermore, to meet the needs of recent computer +architectures and optimization techniques, debugging information must be able to +describe the location of an object whose location changes over the object's +lifetime, and may reside at multiple locations simultaneously during parts of an +object's lifetime. + +Information about the location of program objects is provided by location +descriptions. + +Location descriptions can consist of one or more single location descriptions. + +A single location description specifies the location storage that holds a +program object and a position within the location storage where the program +object starts. The position within the location storage is expressed as a bit +offset relative to the start of the location storage. + +A location storage is a linear stream of bits that can hold values. Each +location storage has a size in bits and can be accessed using a zero-based bit +offset. The ordering of bits within a location storage uses the bit numbering +and direction conventions that are appropriate to the current language on the +target architecture. + +There are five kinds of location storage: + +1. memory location storage + + Corresponds to the target architecture memory address spaces. + +2. register location storage + + Corresponds to the target architecture registers. + +3. implicit location storage + + Corresponds to fixed values that can only be read. + +4. undefined location storage + + Indicates no value is available and therefore cannot be read or written. + +5. composite location storage + + Allows a mixture of these where some bits come from one location storage and + some from another location storage, or from disjoint parts of the same + location storage. + +> NOTE: It may be better to add an implicit pointer location storage kind used +> by the `DW_OP_implicit_pointer` operation. 
It would specify the debugger +> information entry and byte offset provided by the operations. + +Location descriptions are a language independent representation of addressing +rules. + +- They can be the result of evaluating a debugger information entry attribute + that specifies an operation expression of arbitrary complexity. In this usage + they can describe the location of an object as long as its lifetime is either + static or the same as the lexical block (see [3.5 Lexical Block + Entries](#lexical-block-entries)) that owns it, and it does not move during + its lifetime. + +- They can be the result of evaluating a debugger information entry attribute + that specifies a location list expression. In this usage they can describe the + location of an object that has a limited lifetime, changes its location during + its lifetime, or has multiple locations over part or all of its lifetime. + +If a location description has more than one single location description, the +DWARF expression is ill-formed if the object value held in each single location +description's position within the associated location storage is not the same +value, except for the parts of the value that are uninitialized. + +A location description that has more than one single location description can +only be created by a location list expression that has overlapping program +location ranges, or certain expression operations that act on a location +description that has more than one single location description. There are no +operation expression operations that can directly create a location description +with more than one single location description. + +A location description with more than one single location description can be +used to describe objects that reside in more than one piece of storage at the +same time. An object may have more than one location as a result of +optimization. For example, a value that is only read may be promoted from memory +to a register for some region of code, but later code may revert to reading the +value from memory as the register may be used for other purposes. For the code +region where the value is in a register, any change to the object value must be +made in both the register and the memory so both regions of code will read the +updated value. + +A consumer of a location description with more than one single location +description can read the object's value from any of the single location +descriptions (since they all refer to location storage that has the same value), +but must write any changed value to all the single location descriptions. + +Updating a location description L by a bit offset B is defined as adding the +value of B to the bit offset of each single location description SL of L. It is +an evaluation error if the updated bit offset of any SL is less than 0 or +greater than or equal to the size of the location storage specified by SL. + +The evaluation of an expression may require context elements to create a +location description. If such a location description is accessed, the storage it +denotes is that associated with the context element values specified when the +location description was created, which may differ from the context at the time +it is accessed.
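To make the multiple-location and bit-offset rules above concrete, here is a minimal C++ sketch under assumed, hypothetical names (no consumer exposes this exact structure):

    #include <cstdint>
    #include <vector>

    enum class StorageKind { Memory, Register, Implicit, Undefined, Composite };

    // One single location description: a location storage, its size, and the
    // bit offset of the object within that storage.
    struct SingleLocation {
      StorageKind kind;
      uint64_t storage_size_in_bits;
      uint64_t bit_offset;
    };

    // A location description holds one or more single location descriptions;
    // reads may use any of them, writes must go to all of them.
    struct LocationDescription {
      std::vector<SingleLocation> singles;
    };

    // Updating L by a bit offset B adds B to every single location
    // description; an out-of-bounds result is the evaluation-error case.
    bool UpdateByBitOffset(LocationDescription &L, int64_t B) {
      for (SingleLocation &SL : L.singles) {
        int64_t updated = static_cast<int64_t>(SL.bit_offset) + B;
        if (updated < 0 ||
            static_cast<uint64_t>(updated) >= SL.storage_size_in_bits)
          return false; // evaluation error
        SL.bit_offset = static_cast<uint64_t>(updated);
      }
      return true;
    }

+For example, creating a register location description requires the thread context: the location storage is for the specified register of that thread.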
+Creating a memory location description for an address space may require a thread context: the location storage is the memory associated with that thread.
+
+If any of the context elements required to create a location description change, the location description becomes invalid and accessing it is undefined.
+
+Examples of context that can invalidate a location description are:
+
+- The thread context is required and execution causes the thread to terminate.
+- The call frame context is required and further execution causes the call frame to return to the calling frame.
+- The program location is required and further execution of the thread occurs. That could change the location list entry or call frame information entry that applies.
+- An operation uses call frame information:
+  - Any of the frames used in the virtual call frame unwinding return.
+  - The top call frame is used, the program location is used to select the call frame information entry, and further execution of the thread occurs.
+
+A DWARF expression can be used to compute a location description for an object. A subsequent DWARF expression evaluation can be given the object location description as the object context or initial stack context to compute a component of the object. The final result is undefined if the object location description becomes invalid between the two expression evaluations.
+
+A change of a thread's program location may not make a location description invalid, yet may still render it no longer meaningful. Accessing such a location description, or using it as the object context or initial stack context of an expression evaluation, may produce an undefined result.
+
+For example, a location description may specify a register that no longer holds the intended program object after a program location change. One way to avoid such problems is to recompute location descriptions associated with threads when their program locations change.
+
+#### A.2.5.4 DWARF Operation Expressions
+
+An operation expression is comprised of a stream of operations, each consisting of an opcode followed by zero or more operands. The number of operands is implied by the opcode.
+
+Operations represent a postfix operation on a simple stack machine. Each stack entry can hold either a value or a location description. Operations can act on entries on the stack, including adding entries and removing entries. If the kind of a stack entry does not match the kind required by the operation and is not implicitly convertible to the required kind (see [2.5.4.4.3 Memory Location Description Operations](#memory-location-description-operations)), then the DWARF operation expression is ill-formed.
+
+Evaluation of an operation expression starts with an empty stack on which the entries from the initial stack provided by the context are pushed in the order provided. Then the operations are evaluated, starting with the first operation of the stream. Evaluation continues until either an operation has an evaluation error or one past the last operation of the stream is reached.
+
+The result of the evaluation is:
+
+- If an operation has an evaluation error, or an operation evaluates an expression that has an evaluation error, then the result is an evaluation error.
+- If the current result kind specifies a location description, then:
+  - If the stack is empty, the result is a location description with one undefined location description.
+
+    This rule is for backwards compatibility with DWARF Version 5, which uses an empty operation expression for this purpose.
+
+  - If the top stack entry is a location description, or can be converted to one (see [2.5.4.4.3 Memory Location Description Operations](#memory-location-description-operations)), then the result is that, possibly converted, location description. Any other entries on the stack are discarded.
+  - Otherwise the DWARF expression is ill-formed.
+
+  > NOTE: Could define this case as returning an implicit location description as if the `DW_OP_implicit` operation is performed.
+
+- If the current result kind specifies a value, then:
+  - If the top stack entry is a value, or can be converted to one (see [2.5.4.4.3 Memory Location Description Operations](#memory-location-description-operations)), then the result is that, possibly converted, value. Any other entries on the stack are discarded.
+  - Otherwise the DWARF expression is ill-formed.
+- If the current result kind is not specified, then:
+  - If the stack is empty, the result is a location description with one undefined location description.
+
+    This rule is for backwards compatibility with DWARF Version 5, which uses an empty operation expression for this purpose.
+
+    > NOTE: This rule is consistent with the rule above for when a location description is requested. However, GDB appears to report this as an error and no GDB tests appear to cause an empty stack for this case.
+
+  - Otherwise, the top stack entry is returned. Any other entries on the stack are discarded.
+
+An operation expression is encoded as a byte block with some form of prefix that specifies the byte count. It can be used:
+
+- as the value of a debugging information entry attribute that is encoded using class `exprloc` (see [7.5.5 Classes and Forms](#classes-and-forms)),
+- as the operand to certain operation expression operations,
+- as the operand to certain call frame information operations (see [6.4 Call Frame Information](#call-frame-information)),
+- and in location list entries (see [2.5.5 DWARF Location List Expressions](#dwarf-location-list-expressions)).
+
+##### A.2.5.4.1 Stack Operations
+
+> NOTE: This section replaces DWARF Version 5 section 2.5.1.3.
+
+The following operations manipulate the DWARF stack. Operations that index the stack assume that the top of the stack (most recently added entry) has index 0. They allow the stack entries to be either a value or a location description.
+
+If any stack entry accessed by a stack operation is an incomplete composite location description (see [2.5.4.4.6 Composite Location Description Operations](#composite-location-description-operations)), then the DWARF expression is ill-formed.
+
+> NOTE: These operations now support stack entries that are values and location descriptions.
+
+> NOTE: If it is desired to also make them work with incomplete composite location descriptions, then it would be necessary to define that the composite location storage specified by the incomplete composite location description is also replicated when a copy is pushed. This ensures that each copy of the incomplete composite location description can update the composite location storage it specifies independently.
+
+1. `DW_OP_dup`
+
+   `DW_OP_dup` duplicates the stack entry at the top of the stack.
+
+2. `DW_OP_drop`
+
+   `DW_OP_drop` pops the stack entry at the top of the stack and discards it.
+
+3. `DW_OP_pick`
+
+   `DW_OP_pick` has a single unsigned 1-byte operand that represents an index I. A copy of the stack entry with index I is pushed onto the stack.
+
+4. `DW_OP_over`
+
+   `DW_OP_over` pushes a copy of the entry with index 1.
+
+   This is equivalent to a `DW_OP_pick 1` operation.
+
+5. `DW_OP_swap`
+
+   `DW_OP_swap` swaps the top two stack entries. The entry at the top of the stack becomes the second stack entry, and the second stack entry becomes the top of the stack.
+
+6. `DW_OP_rot`
+
+   `DW_OP_rot` rotates the first three stack entries. The entry at the top of the stack becomes the third stack entry, the second entry becomes the top of the stack, and the third entry becomes the second entry.
+
+##### A.2.5.4.2 Control Flow Operations
+
+> NOTE: This section replaces DWARF Version 5 section 2.5.1.5.
+
+The following operations provide simple control of the flow of a DWARF operation expression.
+
+1. `DW_OP_nop`
+
+   `DW_OP_nop` is a placeholder. It has no effect on the DWARF stack entries.
+
+2. `DW_OP_le`, `DW_OP_ge`, `DW_OP_eq`, `DW_OP_lt`, `DW_OP_gt`, `DW_OP_ne`
+
+   > NOTE: The same as in DWARF Version 5 section 2.5.1.5.
+
+3. `DW_OP_skip`
+
+   `DW_OP_skip` is an unconditional branch. Its single operand is a 2-byte signed integer constant. The 2-byte constant is the number of bytes of the DWARF expression to skip forward or backward from the current operation, beginning after the 2-byte constant.
+
+   If the updated position is at one past the end of the last operation, then the operation expression evaluation is complete.
+
+   Otherwise, the DWARF expression is ill-formed if the updated operation position is not in the range of the first to last operation inclusive, or not at the start of an operation.
+
+4. `DW_OP_bra`
+
+   `DW_OP_bra` is a conditional branch. Its single operand is a 2-byte signed integer constant. This operation pops the top of the stack. If the value popped is not the constant 0, the 2-byte constant operand is the number of bytes of the DWARF operation expression to skip forward or backward from the current operation, beginning after the 2-byte constant.
+
+   If the updated position is at one past the end of the last operation, then the operation expression evaluation is complete.
+
+   Otherwise, the DWARF expression is ill-formed if the updated operation position is not in the range of the first to last operation inclusive, or not at the start of an operation.
+
+5. `DW_OP_call2`, `DW_OP_call4`, `DW_OP_call_ref`
+
+   `DW_OP_call2`, `DW_OP_call4`, and `DW_OP_call_ref` perform DWARF procedure calls during evaluation of a DWARF expression.
+
+   `DW_OP_call2` and `DW_OP_call4` have one operand that is, respectively, a 2-byte or 4-byte unsigned offset DR that represents the byte offset of a debugging information entry D relative to the beginning of the current compilation unit.
+
+   `DW_OP_call_ref` has one operand that is a 4-byte unsigned value in the 32-bit DWARF format, or an 8-byte unsigned value in the 64-bit DWARF format, that represents the byte offset DR of a debugging information entry D relative to the beginning of the `.debug_info` section that contains the current compilation unit. D may not be in the current compilation unit.
+
+   > NOTE: DWARF Version 5 states that DR can be an offset in a `.debug_info` section other than the one that contains the current compilation unit.
+   > It states that relocation of references from one executable or shared object file to another must be performed by the consumer. But given that DR is defined as an offset in a `.debug_info` section this seems impossible. If DR was defined as an implementation defined value, then the consumer could choose to interpret the value in an implementation defined manner to reference debugging information in another executable or shared object.
+   >
+   > In ELF the `.debug_info` section is in a non-`PT_LOAD` segment so standard dynamic relocations cannot be used. But even if it were in a loaded segment and dynamic relocations were used, DR would need to be the address of D, not an offset in a `.debug_info` section. That would also need DR to be the size of a global address. So it would not be possible to use the 32-bit DWARF format in a 64-bit global address space. In addition, the consumer would need to determine what executable or shared object the relocated address was in so it could determine the containing compilation unit.
+   >
+   > GDB only interprets DR as an offset in the `.debug_info` section that contains the current compilation unit.
+   >
+   > This comment also applies to `DW_OP_implicit_pointer`.
+
+   Operand interpretation of `DW_OP_call2`, `DW_OP_call4`, and `DW_OP_call_ref` is exactly like that for `DW_FORM_ref2`, `DW_FORM_ref4`, and `DW_FORM_ref_addr`, respectively.
+
+   The call operation is evaluated by:
+
+   - If D has a `DW_AT_location` attribute that is encoded as an `exprloc` that specifies an operation expression E, then execution of the current operation expression continues from the first operation of E. Execution continues until one past the last operation of E is reached, at which point execution continues with the operation following the call operation. The operations of E are evaluated with the same current context, except the current compilation unit is the one that contains D and the stack is the same as that being used by the call operation. After the call operation has been evaluated, the stack is therefore as it is left by the evaluation of the operations of E. Since E is evaluated on the same stack as the call operation, E can use, and/or remove entries already on the stack, and can add new entries to the stack.
+
+     Values on the stack at the time of the call may be used as parameters by the called expression and values left on the stack by the called expression may be used as return values by prior agreement between the calling and called expressions.
+
+   - If D has a `DW_AT_location` attribute that is encoded as a `loclist` or `loclistsptr`, then the specified location list expression E is evaluated. The evaluation of E uses the current context, except the result kind is a location description, the compilation unit is the one that contains D, and the initial stack is empty. The location description result is pushed on the stack.
+
+     > NOTE: This rule avoids having to define how to execute a matched location list entry operation expression on the same stack as the call when there are multiple matches. But it allows the call to obtain the location description for a variable or formal parameter which may use a location list expression.
+     >
+     > An alternative is to treat the case when D has a `DW_AT_location` attribute that is encoded as a `loclist` or `loclistsptr`, and the specified location list expression E' matches a single location list entry with operation expression E, the same as the `exprloc` case and evaluate on the same stack.
+     >
+     > But this is not attractive: if the attribute is for a variable whose expression happens to end with a non-singleton stack, it will not simply put a location description on the stack. Presumably the intent of using `DW_OP_call*` on a variable or formal parameter debugger information entry is to push just one location description on the stack. That location description may have more than one single location description.
+     >
+     > The previous rule for `exprloc` also has the same problem, as normally a variable or formal parameter location expression may leave multiple entries on the stack and only return the top entry.
+     >
+     > GDB implements `DW_OP_call*` by always executing E on the same stack. If the location list has multiple matching entries, it simply picks the first one and ignores the rest. This seems fundamentally at odds with the desire to support multiple places for variables.
+     >
+     > So, it feels like `DW_OP_call*` should both support pushing a location description on the stack for a variable or formal parameter, and also support being able to execute an operation expression on the same stack. Being able to specify a different operation expression for different program locations seems a desirable feature to retain.
+     >
+     > A solution to that is to have a distinct `DW_AT_proc` attribute for the `DW_TAG_dwarf_procedure` debugging information entry. Then the `DW_AT_location` attribute expression is always executed separately and pushes a location description (that may have multiple single location descriptions), and the `DW_AT_proc` attribute expression is always executed on the same stack and can leave anything on the stack.
+     >
+     > The `DW_AT_proc` attribute could have the new classes `exprproc`, `loclistproc`, and `loclistsptrproc` to indicate that the expression is executed on the same stack. `exprproc` is the same encoding as `exprloc`. `loclistproc` and `loclistsptrproc` are the same encoding as their non-`proc` counterparts, except the DWARF is ill-formed if the location list does not match exactly one location list entry and a default entry is required. These forms indicate explicitly that the matched single operation expression must be executed on the same stack. This is better than ad hoc special rules for `loclistproc` and `loclistsptrproc`, which are currently clearly defined to always return a location description. The producer then explicitly indicates the intent through the attribute classes.
+     >
+     > Such a change would be a breaking change for how GDB implements `DW_OP_call*`. However, are the breaking cases actually occurring in practice? GDB could implement the current approach for DWARF Version 5, and the new semantics for DWARF Version 6, as has been done for some other features.
+     >
+     > Another option is to limit the execution to be on the same stack only to the evaluation of an expression E that is the value of a `DW_AT_location` attribute of a `DW_TAG_dwarf_procedure` debugging information entry.
+     > The DWARF would be ill-formed if E is a location list expression that does not match exactly one location list entry. In all other cases the evaluation of an expression E that is the value of a `DW_AT_location` attribute would evaluate E with the current context, except the result kind is a location description, the compilation unit is the one that contains D, and the initial stack is empty. The location description result is pushed on the stack.
+
+   - If D has a `DW_AT_const_value` attribute with a value V, then it is as if a `DW_OP_implicit_value V` operation was executed.
+
+     This allows a call operation to be used to compute the location description for any variable or formal parameter regardless of whether the producer has optimized it to a constant. This is consistent with the `DW_OP_implicit_pointer` operation.
+
+     > NOTE: Alternatively, could deprecate using `DW_AT_const_value` for `DW_TAG_variable` and `DW_TAG_formal_parameter` debugger information entries that are constants and instead use `DW_AT_location` with an operation expression that results in a location description with one implicit location description. Then this rule would not be required.
+
+   - Otherwise, there is no effect and no changes are made to the stack.
+
+     > NOTE: In DWARF Version 5, if D does not have a `DW_AT_location`, then `DW_OP_call*` is defined to have no effect. It is unclear that this is the right definition as a producer should be able to rely on using `DW_OP_call*` to get a location description for any non-`DW_TAG_dwarf_procedure` debugging information entry. Also, the producer should not be creating DWARF with `DW_OP_call*` to a `DW_TAG_dwarf_procedure` that does not have a `DW_AT_location` attribute. So, should this case be defined as an ill-formed DWARF expression?
+
+   The `DW_TAG_dwarf_procedure` debugging information entry can be used to define DWARF procedures that can be called.
+
+##### A.2.5.4.3 Value Operations
+
+This section describes the operations that push values on the stack.
+
+Each value stack entry has a type and a literal value. It can represent a literal value of any supported base type of the target architecture. The base type specifies the size, encoding, and endianity of the literal value.
+
+The base type of value stack entries can be the distinguished generic type.
+
+###### A.2.5.4.3.1 Literal Operations
+
+> NOTE: This section replaces DWARF Version 5 section 2.5.1.1.
+
+The following operations all push a literal value onto the DWARF stack.
+
+Operations other than `DW_OP_const_type` push a value V with the generic type. If V is larger than the generic type, then V is truncated to the generic type size and the low-order bits are used.
+
+1. `DW_OP_lit0`, `DW_OP_lit1`, ..., `DW_OP_lit31`
+
+   The `DW_OP_lit<N>` operations encode an unsigned literal value N from 0 through 31, inclusive. They push the value N with the generic type.
+
+2. `DW_OP_const1u`, `DW_OP_const2u`, `DW_OP_const4u`, `DW_OP_const8u`
+
+   The `DW_OP_const<n>u` operations have a single operand that is a 1, 2, 4, or 8-byte unsigned integer constant U, respectively. They push the value U with the generic type.
+
+3. `DW_OP_const1s`, `DW_OP_const2s`, `DW_OP_const4s`, `DW_OP_const8s`
+
+   The `DW_OP_const<n>s` operations have a single operand that is a 1, 2, 4, or 8-byte signed integer constant S, respectively. They push the value S with the generic type.
+
+4. `DW_OP_constu`
+
+   `DW_OP_constu` has a single unsigned LEB128 integer operand N.
It pushes the + value N with the generic type. + +5. `DW_OP_consts` + + `DW_OP_consts` has a single signed LEB128 integer operand N. It pushes the + value N with the generic type. + +6. `DW_OP_constx` + + `DW_OP_constx` has a single unsigned LEB128 integer operand that represents + a zero-based index into the `.debug_addr` section relative to the value of + the `DW_AT_addr_base` attribute of the associated compilation unit. The + value N in the `.debug_addr` section has the size of the generic type. It + pushes the value N with the generic type. + + The `DW_OP_constx` operation is provided for constants that require + link-time relocation but should not be interpreted by the consumer as a + relocatable address (for example, offsets to thread-local storage). + +7. `DW_OP_const_type` + + `DW_OP_const_type` has three operands. The first is an unsigned LEB128 + integer DR that represents the byte offset of a debugging information entry + D relative to the beginning of the current compilation unit, that provides + the type T of the constant value. The second is a 1-byte unsigned integral + constant S. The third is a block of bytes B, with a length equal to S. + + TS is the bit size of the type T. The least significant TS bits of B are + interpreted as a value V of the type D. It pushes the value V with the type + D. + + The DWARF is ill-formed if D is not a `DW_TAG_base_type` debugging + information entry in the current compilation unit, or if TS divided by 8 + (the byte size) and rounded up to a whole number is not equal to S. + + While the size of the byte block B can be inferred from the type D + definition, it is encoded explicitly into the operation so that the + operation can be parsed easily without reference to the `.debug_info` + section. + +###### A.2.5.4.3.2 Arithmetic and Logical Operations + +> NOTE: This section is the same as DWARF Version 5 section 2.5.1.4. + +###### A.2.5.4.3.3 Type Conversion Operations + +> NOTE: This section is the same as DWARF Version 5 section 2.5.1.6. + +###### A.2.5.4.3.4 Special Value Operations + +> NOTE: This section replaces parts of DWARF Version 5 sections 2.5.1.2, + 2.5.1.3, and 2.5.1.7. + +There are these special value operations currently defined: + +1. `DW_OP_regval_type` + + `DW_OP_regval_type` has two operands. The first is an unsigned LEB128 + integer that represents a register number R. The second is an unsigned + LEB128 integer DR that represents the byte offset of a debugging information + entry D relative to the beginning of the current compilation unit, that + provides the type T of the register value. + + The operation is equivalent to performing `DW_OP_regx R; DW_OP_deref_type + DR`. + + > NOTE: Should DWARF allow the type T to be a larger size than the size of + > the register R? Restricting a larger bit size avoids any issue of + > conversion as the, possibly truncated, bit contents of the register is + > simply interpreted as a value of T. If a conversion is wanted it can be + > done explicitly using a `DW_OP_convert` operation. + > + > GDB has a per register hook that allows a target specific conversion on a + > register by register basis. It defaults to truncation of bigger registers. + > Removing use of the target hook does not cause any test failures in common + > architectures. If the compiler for a target architecture did want some + > form of conversion, including a larger result type, it could always + > explicitly used the `DW_OP_convert` operation. 
+   >
+   > If T is a larger type than the register size, then the default GDB register hook reads bytes from the next register (or reads out of bounds for the last register!). Removing use of the target hook does not cause any test failures in common architectures (except an illegal hand-written assembly test). If a target architecture requires this behavior, these extensions allow a composite location description to be used to combine multiple registers.
+
+2. `DW_OP_deref`
+
+   S is the bit size of the generic type divided by 8 (the byte size) and rounded up to a whole number. DR is the offset of a hypothetical debug information entry D in the current compilation unit for a base type of the generic type.
+
+   The operation is equivalent to performing `DW_OP_deref_type S, DR`.
+
+3. `DW_OP_deref_size`
+
+   `DW_OP_deref_size` has a single 1-byte unsigned integral constant that represents a byte result size S.
+
+   TS is the smaller of the generic type bit size and S scaled by 8 (the byte size). If TS is smaller than the generic type bit size then T is an unsigned integral type of bit size TS, otherwise T is the generic type. DR is the offset of a hypothetical debug information entry D in the current compilation unit for a base type T.
+
+   > NOTE: Truncating the value when S is larger than the generic type matches what GDB does. This allows the generic type size to not be an integral byte size. It does allow S to be arbitrarily large. Should S be restricted to the size of the generic type rounded up to a multiple of 8?
+
+   The operation is equivalent to performing `DW_OP_deref_type S, DR`, except if T is not the generic type, the value V pushed is zero-extended to the generic type bit size and its type changed to the generic type.
+
+4. `DW_OP_deref_type`
+
+   `DW_OP_deref_type` has two operands. The first is a 1-byte unsigned integral constant S. The second is an unsigned LEB128 integer DR that represents the byte offset of a debugging information entry D relative to the beginning of the current compilation unit, that provides the type T of the result value.
+
+   TS is the bit size of the type T.
+
+   While the size of the pushed value V can be inferred from the type T, it is encoded explicitly as the operand S so that the operation can be parsed easily without reference to the `.debug_info` section.
+
+   > NOTE: It is unclear why the operand S is needed. Unlike `DW_OP_const_type`, the size is not needed for parsing. Any evaluation needs to get the base type T to push with the value to know its encoding and bit size.
+
+   It pops one stack entry that must be a location description L.
+
+   A value V of TS bits is retrieved from the location storage LS specified by one of the single location descriptions SL of L.
+
+   If L, or the location description of any composite location description part that is a subcomponent of L, has more than one single location description, then any one of them can be selected as they are required to all have the same value. For any single location description SL, bits are retrieved from the associated storage location starting at the bit offset specified by SL. For a composite location description, the retrieved bits are the concatenation of the N bits from each composite location part PL, where N is limited to the size of PL.
+
+   V is pushed on the stack with the type T.
+
+   > NOTE: This definition makes it an evaluation error if L is a register location description that has less than TS bits remaining in the register storage. Particularly since these extensions extend location descriptions to have a bit offset, it would be odd to define this as performing sign extension based on the type, or be target architecture dependent, as the number of remaining bits could be any number. This matches the GDB implementation for `DW_OP_deref_type`.
+   >
+   > These extensions define `DW_OP_*breg*` in terms of `DW_OP_regval_type`. `DW_OP_regval_type` is defined in terms of `DW_OP_regx`, which uses a 0 bit offset, and `DW_OP_deref_type`. Therefore, it requires the register size to be greater than or equal to the address size of the address space. This matches the GDB implementation for `DW_OP_*breg*`.
+
+   The DWARF is ill-formed if D is not in the current compilation unit, D is not a `DW_TAG_base_type` debugging information entry, or if TS divided by 8 (the byte size) and rounded up to a whole number is not equal to S.
+
+   > NOTE: This definition allows the base type to be a bit size since there seems no reason to restrict it.
+
+   It is an evaluation error if any bit of the value is retrieved from the undefined location storage or if the offset of any bit exceeds the size of the location storage LS specified by any single location description SL of L.
+
+   See [2.5.4.4.5 Implicit Location Description Operations](#implicit-location-description-operations) for special rules concerning implicit location descriptions created by the `DW_OP_implicit_pointer` operation.
+
+5. `DW_OP_xderef`
+
+   `DW_OP_xderef` pops two stack entries. The first must be an integral type value that represents an address A. The second must be an integral type value that represents a target architecture specific address space identifier AS.
+
+   The address size S is defined as the address bit size of the target architecture specific address space that corresponds to AS.
+
+   A is adjusted to S bits by zero-extending if necessary, and then treating the least significant S bits as an unsigned value A'.
+
+   It creates a location description L with one memory location description SL. SL specifies the memory location storage LS that corresponds to AS with a bit offset equal to A' scaled by 8 (the byte size).
+
+   If AS is an address space that is specific to context elements, then LS corresponds to the location storage associated with the current context.
+
+   For example, if AS is for per-thread storage then LS is the location storage for the current thread. Therefore, if L is accessed by an operation, the location storage selected when the location description was created is accessed, and not the location storage associated with the current context of the access operation.
+
+   The DWARF expression is ill-formed if AS is not one of the target architecture specific `DW_ASPACE_*` values.
+
+   The operation is equivalent to popping A and AS, pushing L, and then performing `DW_OP_deref`. The value V retrieved is left on the stack with the generic type.
+
+6. `DW_OP_xderef_size`
+
+   `DW_OP_xderef_size` has a single 1-byte unsigned integral constant that represents a byte result size S.
+
+   It pops two stack entries. The first must be an integral type value that represents an address A. The second must be an integral type value that represents a target architecture specific address space identifier AS.
+
+   It creates a location description L as described for `DW_OP_xderef`.
+
+   The operation is equivalent to popping A and AS, pushing L, and then performing `DW_OP_deref_size S`. The zero-extended value V retrieved is left on the stack with the generic type.
+
+7. `DW_OP_xderef_type`
+
+   `DW_OP_xderef_type` has two operands. The first is a 1-byte unsigned integral constant S. The second operand is an unsigned LEB128 integer DR that represents the byte offset of a debugging information entry D relative to the beginning of the current compilation unit, that provides the type T of the result value.
+
+   It pops two stack entries. The first must be an integral type value that represents an address A. The second must be an integral type value that represents a target architecture specific address space identifier AS.
+
+   It creates a location description L as described for `DW_OP_xderef`.
+
+   The operation is equivalent to popping A and AS, pushing L, and then performing `DW_OP_deref_type DR`. The value V retrieved is left on the stack with the type T.
+
+8. `DW_OP_entry_value` Deprecated
+
+   `DW_OP_entry_value` pushes the value of an expression that is evaluated in the context of the calling frame.
+
+   It may be used to determine the value of arguments on entry to the current call frame provided they are not clobbered.
+
+   It has two operands. The first is an unsigned LEB128 integer S. The second is a block of bytes, with a length equal to S, interpreted as a DWARF operation expression E.
+
+   E is evaluated with the current context, except the result kind is unspecified, the call frame is the one that called the current frame, the program location is the call site in the calling frame, the object is unspecified, and the initial stack is empty. The calling frame information is obtained by virtually unwinding the current call frame using the call frame information (see [6.4 Call Frame Information](#call-frame-information)).
+
+   If the result of E is a location description L (see [2.5.4.4.4 Register Location Description Operations](#register-location-description-operations)), and the last operation executed by E is a `DW_OP_reg*` for register R with a target architecture specific base type of T, then the contents of the register are retrieved as if a `DW_OP_deref_type DR` operation was performed where DR is the offset of a hypothetical debug information entry in the current compilation unit for T. The resulting value V is pushed on the stack.
+
+   Using `DW_OP_reg*` provides a more compact form for the case where the value was in a register on entry to the subprogram.
+
+   > NOTE: It is unclear how this provides a more compact expression, as `DW_OP_regval_type` could be used which is marginally larger.
+
+   If the result of E is a value V, then V is pushed on the stack.
+
+   Otherwise, the DWARF expression is ill-formed.
+
+   The `DW_OP_entry_value` operation is deprecated as its main usage is provided by other means. DWARF Version 5 added the `DW_TAG_call_site_parameter` debugger information entry for call sites that has `DW_AT_call_value`, `DW_AT_call_data_location`, and `DW_AT_call_data_value` attributes that provide DWARF expressions to compute actual parameter values at the time of the call, and requires the producer to ensure the expressions are valid to evaluate even when virtually unwound.
+
+   > NOTE: GDB only implements `DW_OP_entry_value` when E is exactly `DW_OP_reg*` or `DW_OP_breg*; DW_OP_deref*`.
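+
+The sizing rule shared by the `DW_OP_deref*` operations above can be made concrete with a small sketch. The following Python fragment is illustrative only: the names (`GENERIC_TYPE_BITS`, `read_bits`, and the function itself) are hypothetical, and a 64-bit generic type is assumed; it is not part of any DWARF consumer API.
+
+```python
+# Sketch of the DW_OP_deref_size sizing rule: TS is the smaller of the
+# generic type bit size and S scaled by 8, and the retrieved value is
+# zero-extended to the generic type. Assumes a 64-bit generic type.
+GENERIC_TYPE_BITS = 64
+
+def deref_size(bit_offset, storage_size_bits, s, read_bits):
+    """read_bits(offset, count) stands in for the consumer's access to
+    the location storage and returns an unsigned integer."""
+    ts = min(GENERIC_TYPE_BITS, s * 8)
+    if bit_offset + ts > storage_size_bits:
+        # Reading past the end of the location storage is an evaluation error.
+        raise ValueError("evaluation error: read exceeds location storage")
+    value = read_bits(bit_offset, ts)
+    # Zero-extend the TS-bit value to the generic type bit size.
+    return value & ((1 << GENERIC_TYPE_BITS) - 1)
+```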
+
+##### A.2.5.4.4 Location Description Operations
+
+This section describes the operations that push location descriptions on the stack.
+
+###### A.2.5.4.4.1 General Location Description Operations
+
+> NOTE: This section replaces part of DWARF Version 5 section 2.5.1.3.
+
+1. `DW_OP_push_object_address`
+
+   `DW_OP_push_object_address` pushes the location description L of the current object.
+
+   This object may correspond to an independent variable that is part of a user-presented expression that is being evaluated. The object location description may be determined from the variable's own debugging information entry or it may be a component of an array, structure, or class whose address has been dynamically determined by an earlier step during user expression evaluation.
+
+   This operation provides explicit functionality (especially for arrays involving descriptors) that is analogous to the implicit push of the base location description of a structure prior to evaluation of a `DW_AT_data_member_location` to access a data member of a structure.
+
+   > NOTE: This operation could be removed and the object location description specified as the initial stack as for `DW_AT_data_member_location`.
+   >
+   > Or this operation could be used instead of needing to specify an initial stack. The latter approach is more composable as access to the object may be needed at any point of the expression, and passing it as the initial stack requires the entire expression to be aware where on the stack it is. If this were done, `DW_AT_use_location` would require a `DW_OP_push_object2_address` operation for the second object.
+   >
+   > Or there could be a more general way to pass an arbitrary number of arguments in, with an operation such as `DW_OP_arg N` to get the Nth one. A vector of arguments would then be passed in the expression context rather than an initial stack. This could also resolve the issues with `DW_OP_call*` by allowing a specific number of arguments passed in and returned to be specified. The `DW_OP_call*` operation could then always execute on a separate stack: the number of arguments would be specified in a new call operation and taken from the caller's stack, and similarly the number of return results specified and copied from the called stack back to the caller's stack when the called expression was complete.
+   >
+   > The only attribute that specifies a current object is `DW_AT_data_location`, so the non-normative text seems to overstate how this is being used. Or are there other attributes that need to state they pass an object?
+
+###### A.2.5.4.4.2 Undefined Location Description Operations
+
+> NOTE: This section replaces DWARF Version 5 section 2.6.1.1.1.
+
+The undefined location storage represents a piece or all of an object that is present in the source but not in the object code (perhaps due to optimization). Neither reading nor writing to the undefined location storage is meaningful.
+
+An undefined location description specifies the undefined location storage. There is no concept of the size of the undefined location storage, nor of a bit offset for an undefined location description. The `DW_OP_*piece` operations can implicitly specify an undefined location description, allowing any size and offset to be specified, and result in a part with all undefined bits.
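+
+To keep the bookkeeping of the preceding sections straight, the following Python sketch models location storage kinds, single location descriptions, and the "update by a bit offset" rule from A.2.5.3. All class and function names are hypothetical, introduced only for illustration.
+
+```python
+# Toy model of location storage and single location descriptions.
+from dataclasses import dataclass
+from enum import Enum, auto
+
+class StorageKind(Enum):
+    MEMORY = auto()
+    REGISTER = auto()
+    IMPLICIT = auto()
+    UNDEFINED = auto()
+    COMPOSITE = auto()
+
+@dataclass
+class LocationStorage:
+    kind: StorageKind
+    size_in_bits: int
+
+@dataclass
+class SingleLocation:
+    storage: LocationStorage
+    bit_offset: int
+
+def update_by_bit_offset(single_locations, b):
+    """Add B to the bit offset of each single location description SL of L;
+    an updated offset outside the storage is an evaluation error."""
+    for sl in single_locations:
+        new_offset = sl.bit_offset + b
+        if new_offset < 0 or new_offset >= sl.storage.size_in_bits:
+            raise ValueError("evaluation error: offset outside location storage")
+        sl.bit_offset = new_offset
+```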
+
+###### A.2.5.4.4.3 Memory Location Description Operations
+
+> NOTE: This section replaces parts of DWARF Version 5 sections 2.5.1.1, 2.5.1.2, 2.5.1.3, and 2.6.1.1.2.
+
+Each of the target architecture specific address spaces has a corresponding memory location storage that denotes the linear addressable memory of that address space. The size of each memory location storage corresponds to the range of the addresses in the corresponding address space.
+
+It is target architecture defined how address space location storage maps to target architecture physical memory. For example, they may be independent memory, or more than one location storage may alias the same physical memory possibly at different offsets and with different interleaving. The mapping may also be dictated by the source language address classes.
+
+A memory location description specifies a memory location storage. The bit offset corresponds to a bit position within a byte of the memory. Bits accessed using a memory location description access the corresponding target architecture memory starting at the bit position within the byte specified by the bit offset.
+
+A memory location description that has a bit offset that is a multiple of 8 (the byte size) is defined to be a byte address memory location description. It has a memory byte address A that is equal to the bit offset divided by 8.
+
+A memory location description that does not have a bit offset that is a multiple of 8 (the byte size) is defined to be a bit field memory location description. It has a bit position B equal to the bit offset modulo 8, and a memory byte address A equal to the bit offset minus B, divided by 8.
+
+The address space AS of a memory location description is defined to be the address space that corresponds to the memory location storage associated with the memory location description.
+
+A location description that is comprised of one byte address memory location description SL is defined to be a memory byte address location description. It has a byte address equal to A and an address space equal to AS of the corresponding SL.
+
+`DW_ASPACE_none` is defined as the target architecture default address space.
+
+If a stack entry is required to be a location description, but it is a value V with the generic type, then it is implicitly converted to a location description L with one memory location description SL. SL specifies the memory location storage that corresponds to the target architecture default address space with a bit offset equal to V scaled by 8 (the byte size).
+
+> NOTE: If it is wanted to allow any integral type value to be implicitly converted to a memory location description in the target architecture default address space:
+>
+> > If a stack entry is required to be a location description, but is a value V with an integral type, then it is implicitly converted to a location description L with one memory location description SL. If the type size of V is less than the generic type size, then the value V is zero-extended to the size of the generic type. The least significant generic type size bits are treated as an unsigned value to be used as an address A. SL specifies memory location storage corresponding to the target architecture default address space with a bit offset equal to A scaled by 8 (the byte size).
+>
+> The implicit conversion could also be defined as target architecture specific. For example, GDB checks if V is an integral type.
+> If it is not, it gives an error. Otherwise, GDB zero-extends V to 64 bits. If the GDB target defines a hook function, then it is called. The target specific hook function can modify the 64-bit value, possibly sign-extending it based on the original value type. Finally, GDB treats the 64-bit value V as a memory location address.
+
+If a stack entry is required to be a location description, but it is an implicit pointer value IPV with the target architecture default address space, then it is implicitly converted to a location description with one single location description specified by IPV. See [2.5.4.4.5 Implicit Location Description Operations](#implicit-location-description-operations).
+
+If a stack entry is required to be a value, but it is a location description L with one memory location description SL in the target architecture default address space with a bit offset B that is a multiple of 8, then it is implicitly converted to a value equal to B divided by 8 (the byte size) with the generic type.
+
+1. `DW_OP_addr`
+
+   `DW_OP_addr` has a single constant value operand, which has the size of the generic type, that represents an address A.
+
+   It pushes a location description L with one memory location description SL on the stack. SL specifies the memory location storage corresponding to the target architecture default address space with a bit offset equal to A scaled by 8 (the byte size).
+
+   If the DWARF is part of a code object, then A may need to be relocated. For example, in the ELF code object format, A must be adjusted by the difference between the ELF segment virtual address and the virtual address at which the segment is loaded.
+
+2. `DW_OP_addrx`
+
+   `DW_OP_addrx` has a single unsigned LEB128 integer operand that represents a zero-based index into the `.debug_addr` section relative to the value of the `DW_AT_addr_base` attribute of the associated compilation unit. The address value A in the `.debug_addr` section has the size of the generic type.
+
+   It pushes a location description L with one memory location description SL on the stack. SL specifies the memory location storage corresponding to the target architecture default address space with a bit offset equal to A scaled by 8 (the byte size).
+
+   If the DWARF is part of a code object, then A may need to be relocated. For example, in the ELF code object format, A must be adjusted by the difference between the ELF segment virtual address and the virtual address at which the segment is loaded.
+
+3. `DW_OP_form_tls_address`
+
+   `DW_OP_form_tls_address` pops one stack entry that must be an integral type value and treats it as a thread-local storage address TA.
+
+   It pushes a location description L with one memory location description SL on the stack. SL is the target architecture specific memory location description that corresponds to the thread-local storage address TA.
+
+   The meaning of the thread-local storage address TA is defined by the run-time environment. If the run-time environment supports multiple thread-local storage blocks for a single thread, then the block corresponding to the executable or shared library containing this DWARF expression is used.
+
+   Some implementations of C, C++, Fortran, and other languages support a thread-local storage class. Variables with this storage class have distinct values and addresses in distinct threads, much as automatic variables have distinct values and addresses in each subprogram invocation.
+   Typically, there is a single block of storage containing all thread-local variables declared in the main executable, and a separate block for the variables declared in each shared library. Each thread-local variable can then be accessed in its block using an identifier. This identifier is typically a byte offset into the block that is pushed onto the DWARF stack by one of the `DW_OP_const*` operations prior to the `DW_OP_form_tls_address` operation. Computing the address of the appropriate block can be complex (in some cases, the compiler emits a function call to do it), and difficult to describe using ordinary DWARF location descriptions. Instead of forcing complex thread-local storage calculations into the DWARF expressions, the `DW_OP_form_tls_address` operation allows the consumer to perform the computation based on the target architecture specific run-time environment.
+
+4. `DW_OP_call_frame_cfa`
+
+   `DW_OP_call_frame_cfa` pushes the location description L of the Canonical Frame Address (CFA) of the current subprogram, obtained from the call frame information, on the stack. See [6.4 Call Frame Information](#call-frame-information).
+
+   Although the value of the `DW_AT_frame_base` attribute of the debugger information entry corresponding to the current subprogram can be computed using a location list expression, in some cases this would require an extensive location list because the values of the registers used in computing the CFA change during a subprogram execution. If the call frame information is present, then it already encodes such changes, and it is space efficient to reference that using the `DW_OP_call_frame_cfa` operation.
+
+5. `DW_OP_fbreg`
+
+   `DW_OP_fbreg` has a single signed LEB128 integer operand that represents a byte displacement B.
+
+   The location description L for the frame base of the current subprogram is obtained from the `DW_AT_frame_base` attribute of the debugger information entry corresponding to the current subprogram as described in [3.3.5 Low-Level Information](#low-level-information).
+
+   The location description L is updated by bit offset B scaled by 8 (the byte size) and pushed on the stack.
+
+6. `DW_OP_breg0`, `DW_OP_breg1`, ..., `DW_OP_breg31`
+
+   The `DW_OP_breg<N>` operations encode the numbers of up to 32 registers, numbered from 0 through 31, inclusive. The register number R corresponds to the N in the operation name.
+
+   They have a single signed LEB128 integer operand that represents a byte displacement B.
+
+   The address space identifier AS is defined as the one corresponding to the target architecture specific default address space.
+
+   The address size S is defined as the address bit size of the target architecture specific address space corresponding to AS.
+
+   The contents of the register specified by R are retrieved as if a `DW_OP_regval_type R, DR` operation was performed where DR is the offset of a hypothetical debug information entry in the current compilation unit for an unsigned integral base type of size S bits. B is added and the least significant S bits are treated as an unsigned value to be used as an address A.
+
+   They push a location description L comprising one memory location description SL on the stack. SL specifies the memory location storage that corresponds to AS with a bit offset equal to A scaled by 8 (the byte size).
+
+7. `DW_OP_bregx`
+
+   `DW_OP_bregx` has two operands. The first is an unsigned LEB128 integer that represents a register number R.
+   The second is a signed LEB128 integer that represents a byte displacement B.
+
+   The action is the same as for `DW_OP_breg<N>`, except that R is used as the register number and B is used as the byte displacement.
+
+###### A.2.5.4.4.4 Register Location Description Operations
+
+> NOTE: This section replaces DWARF Version 5 section 2.6.1.1.3.
+
+There is a register location storage that corresponds to each of the target architecture registers. The size of each register location storage corresponds to the size of the corresponding target architecture register.
+
+A register location description specifies a register location storage. The bit offset corresponds to a bit position within the register. Bits accessed using a register location description access the corresponding target architecture register starting at the specified bit offset.
+
+1. `DW_OP_reg0`, `DW_OP_reg1`, ..., `DW_OP_reg31`
+
+   The `DW_OP_reg<N>` operations encode the numbers of up to 32 registers, numbered from 0 through 31, inclusive. The target architecture register number R corresponds to the N in the operation name.
+
+   The operation is equivalent to performing `DW_OP_regx R`.
+
+2. `DW_OP_regx`
+
+   `DW_OP_regx` has a single unsigned LEB128 integer operand that represents a target architecture register number R.
+
+   If the current call frame is the top call frame, it pushes a location description L that specifies one register location description SL on the stack. SL specifies the register location storage that corresponds to R with a bit offset of 0 for the current thread.
+
+   If the current call frame is not the top call frame, call frame information (see [6.4 Call Frame Information](#call-frame-information)) is used to determine the location description that holds the register for the current call frame and current program location of the current thread. The resulting location description L is pushed.
+
+   Note that if call frame information is used, the resulting location description may be register, memory, or undefined.
+
+   An implementation may evaluate the call frame information immediately, or may defer evaluation until L is accessed by an operation. If evaluation is deferred, R and the current context can be recorded in L. When accessed, the recorded context is used to evaluate the call frame information, not the current context of the access operation.
+
+These operations obtain a register location. To fetch the contents of a register, it is necessary to use `DW_OP_regval_type`, use one of the `DW_OP_breg*` register-based addressing operations, or use `DW_OP_deref*` on a register location description.
+
+###### A.2.5.4.4.5 Implicit Location Description Operations
+
+> NOTE: This section replaces DWARF Version 5 section 2.6.1.1.4.
+
+Implicit location storage represents a piece or all of an object which has no actual location in the program but whose contents are nonetheless known, either as a constant or because they can be computed from other locations and values in the program.
+
+An implicit location description specifies an implicit location storage. The bit offset corresponds to a bit position within the implicit location storage. Bits accessed using an implicit location description access the corresponding implicit storage value starting at the bit offset.
+
+1. `DW_OP_implicit_value`
+
+   `DW_OP_implicit_value` has two operands. The first is an unsigned LEB128 integer that represents a byte size S.
+   The second is a block of bytes with a length equal to S treated as a literal value V.
+
+   An implicit location storage LS is created with the literal value V and a size of S.
+
+   It pushes a location description L with one implicit location description SL on the stack. SL specifies LS with a bit offset of 0.
+
+2. `DW_OP_stack_value`
+
+   `DW_OP_stack_value` pops one stack entry that must be a value V.
+
+   An implicit location storage LS is created with the literal value V using the size, encoding, and endianity specified by V's base type.
+
+   It pushes a location description L with one implicit location description SL on the stack. SL specifies LS with a bit offset of 0.
+
+   The `DW_OP_stack_value` operation specifies that the object does not exist in memory, but its value is nonetheless known. In this form, the location description specifies the actual value of the object, rather than specifying the memory or register storage that holds the value.
+
+   See [2.5.4.4.5 Implicit Location Description Operations](#implicit-location-description-operations) for special rules concerning implicit pointer values produced by dereferencing implicit location descriptions created by the `DW_OP_implicit_pointer` operation.
+
+   > NOTE: Since location descriptions are allowed on the stack, the `DW_OP_stack_value` operation no longer terminates the DWARF operation expression execution as in DWARF Version 5.
+
+3. `DW_OP_implicit_pointer`
+
+   An optimizing compiler may eliminate a pointer, while still retaining the value that the pointer addressed. `DW_OP_implicit_pointer` allows a producer to describe this value.
+
+   `DW_OP_implicit_pointer` specifies that an object is a pointer to the target architecture default address space that cannot be represented as a real pointer, even though the value it would point to can be described. In this form, the location description specifies a debugging information entry that represents the actual location description of the object to which the pointer would point. Thus, a consumer of the debug information would be able to access the dereferenced pointer, even when it cannot access the pointer itself.
+
+   `DW_OP_implicit_pointer` has two operands. The first operand is a 4-byte unsigned value in the 32-bit DWARF format, or an 8-byte unsigned value in the 64-bit DWARF format, that represents the byte offset DR of a debugging information entry D relative to the beginning of the `.debug_info` section that contains the current compilation unit. The second operand is a signed LEB128 integer that represents a byte displacement B.
+
+   Note that D may not be in the current compilation unit.
+
+   The first operand interpretation is exactly like that for `DW_FORM_ref_addr`.
+
+   The address space identifier AS is defined as the one corresponding to the target architecture specific default address space.
+
+   The address size S is defined as the address bit size of the target architecture specific address space corresponding to AS.
+
+   An implicit location storage LS is created with the debugging information entry D, address space AS, and size of S.
+
+   It pushes a location description L that comprises one implicit location description SL on the stack. SL specifies LS with a bit offset of 0.
+
+   It is an evaluation error if a `DW_OP_deref*` operation pops a location description L', and retrieves S bits, such that any retrieved bits come from an implicit location storage that is the same as LS, unless both of the following conditions are met:
+
+   1. All retrieved bits come from an implicit location description that refers to an implicit location storage that is the same as LS.
+
+      Note that all bits do not have to come from the same implicit location description, as L' may involve composite location descriptions.
+
+   2. The bits come from consecutive ascending offsets within their respective implicit location storage.
+
+   These rules are equivalent to retrieving the complete contents of LS.
+
+   If both the above conditions are met, then the value V pushed by the `DW_OP_deref*` operation is an implicit pointer value IPV with a target architecture specific address space of AS, a debugging information entry of D, and a base type of T. If AS is the target architecture default address space, then T is the generic type. Otherwise, T is a target architecture specific integral type with a bit size equal to S.
+
+   If IPV is implicitly converted to a location description (which is only done if AS is the target architecture default address space), then the resulting location description RL is:
+
+   - If D has a `DW_AT_location` attribute, the DWARF expression E from the `DW_AT_location` attribute is evaluated with the current context, except that the result kind is a location description, the compilation unit is the one that contains D, the object is unspecified, and the initial stack is empty. RL is the expression result.
+
+     Note that E is evaluated with the context of the expression accessing IPV, and not the context of the expression that contained the `DW_OP_implicit_pointer` operation that created L.
+
+   - If D has a `DW_AT_const_value` attribute, then an implicit location storage RLS is created from the `DW_AT_const_value` attribute's value with a size matching the size of the `DW_AT_const_value` attribute's value. RL comprises one implicit location description SRL. SRL specifies RLS with a bit offset of 0.
+
+     > NOTE: If using `DW_AT_const_value` for variables and formal parameters is deprecated and instead `DW_AT_location` is used with an implicit location description, then this rule would not be required.
+
+   - Otherwise, it is an evaluation error.
+
+   The location description RL is updated by bit offset B scaled by 8 (the byte size).
+
+   If a `DW_OP_stack_value` operation pops a value that is the same as IPV, then it pushes a location description that is the same as L.
+
+   It is an evaluation error if LS or IPV is accessed in any other manner.
+
+   The restrictions on how an implicit pointer location description created by `DW_OP_implicit_pointer` can be used are intended to simplify the DWARF consumer. The same applies to an implicit pointer value created by `DW_OP_deref*` and `DW_OP_stack_value`.
+
+Typically a `DW_OP_implicit_pointer` operation is used in a DWARF expression E1 of a `DW_TAG_variable` or `DW_TAG_formal_parameter` debugging information entry D1's `DW_AT_location` attribute. The debugging information entry referenced by the `DW_OP_implicit_pointer` operation is typically itself a `DW_TAG_variable` or `DW_TAG_formal_parameter` debugging information entry D2 whose `DW_AT_location` attribute gives a second DWARF expression E2.
+
+D1 and E1 are describing the location of a pointer type object.
D2 and E2 are describing the location of the +object pointed to by that pointer object. + +However, D2 may be any debugging information entry that contains a +`DW_AT_location` or `DW_AT_const_value` attribute (for example, +`DW_TAG_dwarf_procedure`). By using E2, a consumer can reconstruct +the value of the object when asked to dereference the pointer described by +E1, which contains the `DW_OP_implicit_pointer` operation. + +###### A.2.5.4.4.6 Composite Location Description Operations + +> NOTE: This section replaces DWARF Version 5 section 2.6.1.2. + +A composite location storage represents an object or value which may be +contained in part of another location storage or contained in parts of more than +one location storage. + +Each part has a part location description L and a part bit size S. L can have +one or more single location descriptions SL. If there is more than one SL, +that indicates that the part is located in more than one place. The bits of each +place of the part comprise S contiguous bits from the location storage LS +specified by SL starting at the bit offset specified by SL. All the bits must be +within the size of LS or the DWARF expression is ill-formed. + +A composite location storage can have zero or more parts. The parts are +contiguous such that the zero-based location storage bit index will range over +each part with no gaps between them. Therefore, the size of a composite location +storage is the sum of the size of its parts. The DWARF expression is ill-formed +if the size of the composite location storage is larger than the size of the +memory location storage corresponding to the largest target architecture +specific address space. + +A composite location description specifies a composite location storage. The bit +offset corresponds to a bit position within the composite location storage. + +There are operations that create a composite location storage. + +There are other operations that allow a composite location storage to be +incrementally created. Each part is created by a separate operation. There may +be one or more operations to create the final composite location storage. A +series of such operations describes the parts of the composite location storage +in the order in which the associated part operations are executed. + +To support incremental creation, a composite location storage can be in an +incomplete state. When an incremental operation operates on an incomplete +composite location storage, it adds a new part. + +A composite location description that specifies a composite location storage +that is incomplete is termed an incomplete composite location description. A +composite location description that specifies a composite location storage that +is complete is termed a complete composite location description. + +If the top stack entry is a location description that has one incomplete +composite location description SL after the execution of an operation expression +has completed, SL is converted to a complete composite location description. + +Note that this conversion does not happen after the completion of an +operation expression that is evaluated on the same stack by the `DW_OP_call*` +operations. Such executions are not a separate evaluation of an operation +expression, but rather the continued evaluation of the same operation expression +that contains the `DW_OP_call*` operation.
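+
+As a non-normative illustration (a sketch using the `DW_OP_piece` operation
+defined below; the register number and address are hypothetical), an 8-byte
+object whose low four bytes are in a register and whose high four bytes were
+spilled to memory could be described by a composite location description with
+two parts:
+
+```
+DW_OP_regx 1        ; push a register location description for register 1
+DW_OP_piece 4       ; part 1: the first 4 bytes of the object are in register 1
+DW_OP_addr 0x1000   ; push a memory location description for address 0x1000
+DW_OP_piece 4       ; part 2: the remaining 4 bytes are at address 0x1000
+```
+
+The resulting composite location storage is 64 bits in size, with the two parts
+at bit offsets 0 and 32 respectively.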
+ +If a stack entry is required to be a location description L, but L has an +incomplete composite location description, then the DWARF expression is +ill-formed. The exception is for the operations involved in incrementally +creating a composite location description as described below. + +Note that a DWARF operation expression may arbitrarily compose composite +location descriptions from any other location description, including those that +have multiple single location descriptions, and those that have composite +location descriptions. + +The incremental composite location description operations are defined to be +compatible with the definitions in DWARF Version 5. + +1. `DW_OP_piece` + + `DW_OP_piece` has a single unsigned LEB128 integer that represents a byte + size S. + + The action is based on the context: + + - If the stack is empty, then a location description L comprised of one + incomplete composite location description SL is pushed on the stack. + + An incomplete composite location storage LS is created with a single part + P. P specifies a location description PL and has a bit size of S scaled by + 8 (the byte size). PL is comprised of one undefined location description + PSL. + + SL specifies LS with a bit offset of 0. + + - Otherwise, if the top stack entry is a location description L comprised of + one incomplete composite location description SL, then the incomplete + composite location storage LS that SL specifies is updated to append a new + part P. P specifies a location description PL and has a bit size of S + scaled by 8 (the byte size). PL is comprised of one undefined location + description PSL. L is left on the stack. + - Otherwise, if the top stack entry is a location description or can be + converted to one, then it is popped and treated as a part location + description PL. Then: + + - If the top stack entry (after popping PL) is a location description L + comprised of one incomplete composite location description SL, then the + incomplete composite location storage LS that SL specifies is updated to + append a new part P. P specifies the location description PL and has a + bit size of S scaled by 8 (the byte size). L is left on the stack. + - Otherwise, a location description L comprised of one + incomplete composite location description SL is pushed on + the stack. + + An incomplete composite location storage LS is created with a single + part P. P specifies the location description PL and has a bit size of S + scaled by 8 (the byte size). + + SL specifies LS with a bit offset of 0. + + - Otherwise, the DWARF expression is ill-formed. + + Many compilers store a single variable in sets of registers or store a + variable partially in memory and partially in registers. `DW_OP_piece` + provides a way of describing where a part of a variable is located. + + The evaluation rules for the `DW_OP_piece` operation allow it to be + compatible with the DWARF Version 5 definition. + + > NOTE: Since these extensions allow location descriptions to be entries on + > the stack, a simpler operation to create composite location descriptions + > could be defined. For example, just one operation that specifies how many + > parts, and pops pairs of stack entries for the part size and location + > description. Not only would this be a simpler operation and avoid the + > complexities of incomplete composite location descriptions, but it may + > also have a smaller encoding in practice. However, the desire for + > compatibility with DWARF Version 5 is likely a stronger consideration.
+ +2. `DW_OP_bit_piece` + + `DW_OP_bit_piece` has two operands. The first is an unsigned LEB128 integer + that represents the part bit size S. The second is an unsigned LEB128 + integer that represents a bit displacement B. + + The action is the same as for `DW_OP_piece`, except that any part created + has the bit size S, and the location description PL of any created part is + updated by a bit offset B. + + `DW_OP_bit_piece` is used instead of `DW_OP_piece` when the piece to be + assembled is not byte-sized or is not at the start of the part location + description. + +#### A.2.5.5 DWARF Location List Expressions + +> NOTE: This section replaces DWARF Version 5 section 2.6.2. + +To meet the needs of recent computer architectures and optimization +techniques, debugging information must be able to describe the location of an +object whose location changes over the object's lifetime and that may reside at +multiple locations during parts of that lifetime. Location list +expressions are used in place of operation expressions whenever the object whose +location is being described has these requirements. + +A location list expression consists of a series of location list entries. Each +location list entry is one of the following kinds: + +1. Bounded location description + + This kind of location list entry provides an operation expression that + evaluates to the location description of an object that is valid over a + lifetime bounded by a starting and ending address. The starting address is + the lowest address of the address range over which the location is valid. + The ending address is the address of the first location past the highest + address of the address range. + + The location list entry matches when the current program location is within + the given range. + + There are several kinds of bounded location description entries which differ + in the way that they specify the starting and ending addresses. + +2. Default location description + + This kind of location list entry provides an operation expression that + evaluates to the location description of an object that is valid when no + bounded location description entry applies. + + The location list entry matches when the current program location is not + within the range of any bounded location description entry. + +3. Base address + + This kind of location list entry provides an address to be used as the base + address for beginning and ending address offsets given in certain kinds of + bounded location description entries. The applicable base address of a + bounded location description entry is the address specified by the closest + preceding base address entry in the same location list. If there is no + preceding base address entry, then the applicable base address defaults to + the base address of the compilation unit (see DWARF Version 5 section + 3.1.1). + + In the case of a compilation unit where all of the machine code is contained + in a single contiguous section, no base address entry is needed. + +4. End-of-list + + This kind of location list entry marks the end of the location list + expression. + +The address ranges defined by the bounded location description entries of a +location list expression may overlap. When they do, they describe a situation in +which an object exists simultaneously in more than one place.
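+
+As a non-normative illustration (shown schematically, not in encoded form; the
+address ranges, register number, and offset are hypothetical), a location list
+expression for an object that lives in a register over one range of program
+locations, lives in a stack slot over another, and is otherwise known to be the
+constant 0 might comprise:
+
+```
+[0x1000, 0x1010): DW_OP_regx 5                  ; bounded: in register 5
+[0x1010, 0x1030): DW_OP_breg6 -8                ; bounded: in memory at [R6 - 8]
+default:          DW_OP_lit0, DW_OP_stack_value ; otherwise the value 0
+end-of-list
+```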
+ +If all of the address ranges in a given location list expression do not +collectively cover the entire range over which the object in question is +defined, and there is no following default location description entry, it is +assumed that the object is not available for the portion of the range that is +not covered. + +The result of the evaluation of a DWARF location list expression is: + +- If the current program location is not specified, then it is an evaluation + error. + + > NOTE: If the location list only has a single default entry, should that be + > considered a match if there is no program location? If there are non-default + > entries, then it seems it has to be an evaluation error when there is no + > program location, as that indicates the location depends on the program + > location, which is not known. + +- If there are no matching location list entries, then the result is a location + description that comprises one undefined location description. +- Otherwise, the operation expression E of each matching location list entry is + evaluated with the current context, except that the result kind is a location + description, the object is unspecified, and the initial stack is empty. The + location list entry result is the location description returned by the + evaluation of E. + + The result is a location description that is comprised of the union of the + single location descriptions of the location description result of each + matching location list entry. + +A location list expression can only be used as the value of a debugger +information entry attribute that is encoded using class `loclist` or +`loclistsptr` (see [7.5.5 Classes and Forms](#classes-and-forms)). The value of +the attribute provides an index into a separate object file section called +`.debug_loclists` or `.debug_loclists.dwo` (for split DWARF object files) that +contains the location list entries. + +A `DW_OP_call*` and `DW_OP_implicit_pointer` operation can be used to specify a +debugger information entry attribute that has a location list expression. +Several debugger information entry attributes allow DWARF expressions that are +evaluated with an initial stack that includes a location description that may +originate from the evaluation of a location list expression. + +This location list representation, the `loclist` and `loclistsptr` class, and +the related `DW_AT_loclists_base` attribute are new in DWARF Version 5. Together +they eliminate most, or all, of the code object relocations previously needed for +location list expressions. + +> NOTE: The rest of this section is the same as DWARF Version 5 section 2.6.2. + +## A.3 Program Scope Entries + +> NOTE: This section provides changes to existing debugger information entry +> attributes. These would be incorporated into the corresponding DWARF Version 5 +> chapter 3 sections. + +### A.3.3 Subroutine and Entry Point Entries + +#### A.3.3.5 Low-Level Information + +1. A `DW_TAG_subprogram`, `DW_TAG_inlined_subroutine`, or `DW_TAG_entry_point` + debugger information entry may have a `DW_AT_return_addr` attribute, whose + value is a DWARF expression E. + + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an empty initial stack, and other context + elements corresponding to the source language thread of execution upon which + the user is focused, if any.
The result of the evaluation is the location + description L of the place where the return address for the current call + frame's subprogram or entry point is stored. + + The DWARF is ill-formed if L is not comprised of one memory location + description for one of the target architecture specific address spaces. + + > NOTE: It is unclear why `DW_TAG_inlined_subroutine` has a + > `DW_AT_return_addr` attribute but not a `DW_AT_frame_base` or + > `DW_AT_static_link` attribute. Seems it would either have all of them or + > none. Since inlined subprograms do not have a call frame it seems they + > would have none of these attributes. + +2. A `DW_TAG_subprogram` or `DW_TAG_entry_point` debugger information entry may + have a `DW_AT_frame_base` attribute, whose value is a DWARF expression E. + + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an empty initial stack, and other context + elements corresponding to the source language thread of execution upon which + the user is focused, if any. + + The DWARF is ill-formed if E contains a `DW_OP_fbreg` operation, or the + resulting location description L is not comprised of one single location + description SL. + + If SL is a register location description for register R, then L is replaced + with the result of evaluating a `DW_OP_bregx R, 0` operation. This computes + the frame base memory location description in the target architecture + default address space. + + This allows the more compact `DW_OP_reg*` to be used instead of + `DW_OP_breg* 0`. + + > NOTE: This rule could be removed and require the producer to create the + > required location description directly using `DW_OP_call_frame_cfa` or + > `DW_OP_breg*`. This would also then allow a target to implement the call + > frames within a large register. + + Otherwise, the DWARF is ill-formed if SL is not a memory location + description in any of the target architecture specific address spaces. + + The resulting L is the frame base for the subprogram or entry point. + + Typically, E will use the `DW_OP_call_frame_cfa` operation or be a stack + pointer register plus or minus some offset. + +3. If a `DW_TAG_subprogram` or `DW_TAG_entry_point` debugger information entry + is lexically nested, it may have a `DW_AT_static_link` attribute, whose + value is a DWARF expression E. + + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an empty initial stack, and other context + elements corresponding to the source language thread of execution upon which + the user is focused, if any. The result of the evaluation is the location + description L of the canonical frame address (see [6.4 Call Frame + Information](#call-frame-information)) of the relevant call frame of the + subprogram instance that immediately lexically encloses the current call + frame's subprogram or entry point. + + The DWARF is ill-formed if L is not comprised of one memory location + description for one of the target architecture specific address spaces. + +### A.3.4 Call Site Entries and Parameters + +#### A.3.4.2 Call Site Parameters + +1. A `DW_TAG_call_site_parameter` debugger information entry may have a + `DW_AT_call_value` attribute, whose value is a DWARF operation expression + E1.
+ + The result of the `DW_AT_call_value` attribute is obtained by evaluating + E1 with a context that has a result kind of a value, an unspecified + object, the compilation unit that contains E1, an empty initial stack, and other + context elements corresponding to the source language thread of execution upon + which the user is focused, if any. The resulting value V1 is the + value of the parameter at the time of the call made by the call site. + + For parameters passed by reference, where the code passes a pointer to a + location which contains the parameter, or for reference type parameters, the + `DW_TAG_call_site_parameter` debugger information entry may also have a + `DW_AT_call_data_location` attribute whose value is a DWARF operation expression + E2, and a `DW_AT_call_data_value` attribute whose value is a DWARF + operation expression E3. + + The value of the `DW_AT_call_data_location` attribute is obtained by evaluating + E2 with a context that has a result kind of a location description, + an unspecified object, the compilation unit that contains E2, an empty initial + stack, and other context elements corresponding to the source language thread of + execution upon which the user is focused, if any. The resulting location + description L2 is the location where the referenced parameter lives + during the call made by the call site. If E2 would just be a + `DW_OP_push_object_address`, then the `DW_AT_call_data_location` attribute may + be omitted. + + > NOTE: DWARF Version 5 implies that `DW_OP_push_object_address` may be + > used but does not state what object must be specified in the context. + > Either `DW_OP_push_object_address` cannot be used, or the object to be + > passed in the context must be defined. + + The value of the `DW_AT_call_data_value` attribute is obtained by evaluating + E3 with a context that has a result kind of a value, an unspecified + object, the compilation unit that contains E3, an empty initial stack, and other + context elements corresponding to the source language thread of execution upon + which the user is focused, if any. The resulting value V3 is the + value in L2 at the time of the call made by the call site. + + The result of these attributes is undefined if the current call frame is not for + the subprogram containing the `DW_TAG_call_site_parameter` debugger information + entry or the current program location is not for the call site containing the + `DW_TAG_call_site_parameter` debugger information entry in the current call + frame. + + The consumer may have to virtually unwind to the call site (see [6.4 Call + Frame Information](#call-frame-information)) in order to evaluate these + attributes. This will ensure the source language thread of execution upon which + the user is focused corresponds to the call site needed to evaluate the + expression. + + If it is not possible to prevent the expressions of these attributes from + accessing registers or memory locations that might be clobbered by the + subprogram being called by the call site, then the associated attribute should + not be provided. + + The reason for the restriction is that the parameter may need to be accessed + during the execution of the callee. The consumer may virtually unwind from the + called subprogram back to the caller and then evaluate the attribute + expressions.
The call frame information (see [6.4 Call + Frame Information](#call-frame-information)) will not be able to restore registers + that have been clobbered, and clobbered memory will no longer have the value at + the time of the call. + +### A.3.5 Lexical Block Entries + +> NOTE: This section is the same as DWARF Version 5 section 3.5. + +## A.4 Data Object and Object List Entries + +> NOTE: This section provides changes to existing debugger information entry +> attributes. These would be incorporated into the corresponding DWARF Version 5 +> chapter 4 sections. + +### A.4.1 Data Object Entries + +1. Any debugging information entry describing a data object (which includes + variables and parameters) or common blocks may have a `DW_AT_location` + attribute, whose value is a DWARF expression E. + + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an empty initial stack, and other context + elements corresponding to the source language thread of execution upon which + the user is focused, if any. The result of the evaluation is the location + description of the base of the data object. + + See [2.5.4.2 Control Flow Operations](#control-flow-operations) for special + evaluation rules used by the `DW_OP_call*` operations. + + > NOTE: Delete the description of how the `DW_OP_call*` operations evaluate + > a `DW_AT_location` attribute as that is now described in the operations. + + > NOTE: See the discussion about the `DW_AT_location` attribute in the + > `DW_OP_call*` operation. Having each attribute only have a single purpose + > and single execution semantics seems desirable. It makes it easier for the + > consumer, which no longer has to track the context. It makes it easier for + > the producer as it can rely on a single semantics for each attribute. + > + > For that reason, limiting the `DW_AT_location` attribute to only + > supporting evaluating the location description of an object, and using a + > different attribute and encoding class for the evaluation of DWARF + > expression procedures on the same operation expression stack seems + > desirable. + +2. `DW_AT_const_value` + + > NOTE: Could deprecate using the `DW_AT_const_value` attribute for + > `DW_TAG_variable` or `DW_TAG_formal_parameter` debugger information + > entries that have been optimized to a constant. Instead, `DW_AT_location` + > could be used with a DWARF expression that produces an implicit location + > description now that any location description can be used within a DWARF + > expression. This allows the `DW_OP_call*` operations to be used to push + > the location description of any variable regardless of how it is + > optimized. + +## A.5 Type Entries + +> NOTE: This section provides changes to existing debugger information entry +> attributes. These would be incorporated into the corresponding DWARF Version 5 +> chapter 5 sections. + +### A.5.7 Structure, Union, Class and Interface Type Entries + +#### A.5.7.3 Derived or Extended Structures, Classes and Interfaces + +1. For a `DW_AT_data_member_location` attribute there are two cases: + + 1. If the attribute is an integer constant B, it provides the offset in + bytes from the beginning of the containing entity. + + The result of the attribute is obtained by updating the bit offset of + the location description of the beginning of the containing entity by B + scaled by 8 (the byte size).
The result is the location description of + the base of the member entry. + + If the beginning of the containing entity is not byte aligned, then + the beginning of the member entry has the same bit displacement within a + byte. + + 2. Otherwise, the attribute must be a DWARF expression E which is evaluated + with a context that has a result kind of a location description, an + unspecified object, the compilation unit that contains E, an initial + stack comprising the location description of the beginning of the + containing entity, and other context elements corresponding to the + source language thread of execution upon which the user is focused, if + any. The result of the evaluation is the location description of the + base of the member entry. + + > NOTE: The beginning of the containing entity can now be any location + > description, including those with more than one single location + > description, and those with single location descriptions that are of any + > kind and have any bit offset. + +#### A.5.7.8 Member Function Entries + +1. An entry for a virtual function also has a `DW_AT_vtable_elem_location` + attribute whose value is a DWARF expression E. + + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an initial stack comprising the location + description of the object of the enclosing type, and other context elements + corresponding to the source language thread of execution upon which the user + is focused, if any. The result of the evaluation is the location description + of the slot for the function within the virtual function table for the + enclosing class. + +### A.5.14 Pointer to Member Type Entries + +1. The `DW_TAG_ptr_to_member_type` debugging information entry has a + `DW_AT_use_location` attribute whose value is a DWARF expression E. It is used + to compute the location description of the member of the class to which the + pointer to member entry points. + + The method used to find the location description of a given member of a + class, structure, or union is common to any instance of that class, structure, + or union and to any instance of the pointer to member type. The method is thus + associated with the pointer to member type, rather than with each object that + has a pointer to member type. + + The `DW_AT_use_location` DWARF expression is used in conjunction with the + location description for a particular object of the given pointer to member type + and for a particular structure or class instance. + + The result of the attribute is obtained by evaluating E with a context that has + a result kind of a location description, an unspecified object, the compilation + unit that contains E, an initial stack comprising two entries, and other context + elements corresponding to the source language thread of execution upon which the + user is focused, if any. The first stack entry is the value of the pointer to + member object itself. The second stack entry is the location description of the + base of the entire class, structure, or union instance containing the member + whose location is being calculated. The result of the evaluation is the location + description of the member of the class to which the pointer to member entry + points. + +### A.5.16 Dynamic Type Entries + +1. 
The `DW_AT_data_location` attribute may be used with any type that provides one + or more levels of hidden indirection and/or run-time parameters in its + representation. Its value is a DWARF operation expression E which computes the + location description of the data for an object. When this attribute is omitted, + the location description of the data is the same as the location description of + the object. + + The result of the attribute is obtained by evaluating E with a context that has + a result kind of a location description, an object that is the location + description of the data descriptor, the compilation unit that contains E, an + empty initial stack, and other context elements corresponding to the source + language thread of execution upon which the user is focused, if any. The result + of the evaluation is the location description of the base of the member entry. + + E will typically involve an operation expression that begins with a + `DW_OP_push_object_address` operation, which loads the location description + of the object; this can then serve as a descriptor in subsequent + calculation. + + > NOTE: Since `DW_AT_data_member_location`, `DW_AT_use_location`, and + > `DW_AT_vtable_elem_location` allow both operation expressions and location + > list expressions, why does `DW_AT_data_location` not allow both? In all cases + > they apply to data objects, so it is less likely that optimization would cause + > different operation expressions for different program location ranges. But if + > this is supported for some, then it should be supported for all. + > + > It seems odd this attribute is not the same as `DW_AT_data_member_location` in + > having an initial stack with the location description of the object, since the + > expression needs it. + +## A.6 Other Debugging Information + +> NOTE: This section provides changes to existing debugger information entry +> attributes. These would be incorporated into the corresponding DWARF Version 5 +> chapter 6 sections. + +### A.6.2 Line Number Information + +> NOTE: This section is the same as DWARF Version 5 section 6.2. + +### A.6.4 Call Frame Information + +> NOTE: This section provides changes to DWARF Version 5 section 6.4. Register +> unwind DWARF expressions are generalized to allow any location description, +> including those with composite and implicit location descriptions. + +#### A.6.4.1 Structure of Call Frame Information + +The register rules are: + +1. undefined + + A register that has this rule has no recoverable value in the previous + frame. The previous value of this register is the undefined location + description (see [2.5.4.4.2 Undefined Location Description + Operations](#undefined-location-description-operations)). + + By convention, the register is not preserved by a callee. + +2. same value + + This register has not been modified from the previous caller frame. + + If the current frame is the top frame, then the previous value of this + register is the location description L that specifies one register location + description SL. SL specifies the register location storage that corresponds + to the register with a bit offset of 0 for the current thread. + + If the current frame is not the top frame, then the previous value of this + register is the location description obtained using the call frame + information for the callee frame and callee program location invoked by the + current caller frame for the same register. + + By convention, the register is preserved by the callee, but the callee + has not modified it. + +3.
offset(N) + + N is a signed byte offset. The previous value of this register is saved at + the location description L, where L is the location description of the + current CFA (see [2.5.4 DWARF Operation + Expressions](#dwarf-operation-expressions)) updated with the bit offset N + scaled by 8 (the byte size). + +4. val_offset(N) + + N is a signed byte offset. The previous value of this register is the memory + byte address of the location description L, where L is the location + description of the current CFA (see [2.5.4 DWARF Operation + Expressions](#dwarf-operation-expressions)) updated with the bit offset N + scaled by 8 (the byte size). + + The DWARF is ill-formed if the CFA location description is not a memory byte + address location description, or if the register size does not match the + size of an address in the target architecture default address space. + + Since the CFA location description is required to be a memory byte + address location description, the value of val_offset(N) will also be a + memory byte address since it is offsetting the CFA + location description by N bytes. Furthermore, the value of val_offset(N) + will be a memory byte address in the target architecture default address + space. + + > NOTE: Should DWARF allow the address size to be a different size to the + > size of the register? Requiring them to be the same bit size avoids any + > issue of conversion as the bit contents of the register are simply + > interpreted as the value of the address. + > + > GDB has a per register hook that allows a target specific conversion on a + > register by register basis. It defaults to truncation of bigger registers, + > and to actually reading bytes from the next register (or reads out of + > bounds for the last register) for smaller registers. There are no GDB + > tests that read a register out of bounds (except an illegal hand-written + > assembly test). + +5. register(R) + + This register has been stored in another register numbered R. + + The previous value of this register is the location description obtained + using the call frame information for the current frame and current program + location for register R. + + The DWARF is ill-formed if the size of this register does not match the size + of register R or if there is a cyclic dependency in the call frame + information. + + > NOTE: Should this also allow R to be larger than this register? If so is + > the value stored in the low order bits and it is undefined what is stored + > in the extra upper bits? + +6. expression(E) + + The previous value of this register is located at the location description + produced by evaluating the DWARF operation expression E (see [2.5.4 DWARF + Operation Expressions](#dwarf-operation-expressions)). + + E is evaluated with the current context, except the result kind is a + location description, the compilation unit is unspecified, the object is + unspecified, and an initial stack comprising the location description of the + current CFA (see [2.5.4 DWARF Operation + Expressions](#dwarf-operation-expressions)). + +7. val_expression(E) + + The previous value of this register is the value produced by evaluating the + DWARF operation expression E (see [2.5.4 DWARF + Operation Expressions](#dwarf-operation-expressions)).
+ + E is evaluated with the current context, except the result kind is a value, + the compilation unit is unspecified, the object is unspecified, and an + initial stack comprising the location description of the current CFA (see + [2.5.4 DWARF Operation Expressions](#dwarf-operation-expressions)). + + The DWARF is ill-formed if the resulting value type size does not match the + register size. + + > NOTE: This has limited usefulness as the DWARF expression E can only + > produce values up to the size of the generic type. This is due to not + > allowing any operations that specify a type in a CFI operation expression. + > This makes it unusable for registers that are larger than the generic + > type. However, expression(E) can be used to create an implicit + > location description of any size. + +8. architectural + + The rule is defined externally to this specification by the augmenter. + +A Common Information Entry (CIE) holds information that is shared among many +Frame Description Entries (FDE). There is at least one CIE in every non-empty +`.debug_frame` section. A CIE contains the following fields, in order: + +1. `length` (initial length) + + A constant that gives the number of bytes of the CIE structure, not + including the length field itself. The size of the length field plus the + value of length must be an integral multiple of the address size specified + in the `address_size` field. + +2. `CIE_id` (4 or 8 bytes, see [7.4 32-Bit and 64-Bit DWARF + Formats](#32-bit-and-64-bit-dwarf-formats)) + + A constant that is used to distinguish CIEs from FDEs. + + In the 32-bit DWARF format, the value of the CIE id in the CIE header is + 0xffffffff; in the 64-bit DWARF format, the value is 0xffffffffffffffff. + +3. `version` (ubyte) + + A version number. This number is specific to the call frame information and + is independent of the DWARF version number. + + The value of the CIE version number is 4. + + > NOTE: Would this be increased to 5 to reflect the changes in these + > extensions? + +4. `augmentation` (sequence of UTF-8 characters) + + A null-terminated UTF-8 string that identifies the augmentation to this CIE + or to the FDEs that use it. If a reader encounters an augmentation string + that is unexpected, then only the following fields can be read: + + - CIE: length, CIE_id, version, augmentation + - FDE: length, CIE_pointer, initial_location, address_range + + If there is no augmentation, this value is a zero byte. + + The augmentation string allows users to indicate that there is additional + vendor and target architecture specific information in the CIE or FDE which + is needed to virtually unwind a stack frame. For example, this might be + information about dynamically allocated data which needs to be freed on exit + from the routine. + + Because the `.debug_frame` section is useful independently of any + `.debug_info` section, the augmentation string always uses UTF-8 + encoding. + +5. `address_size` (ubyte) + + The size of a target address in this CIE and any FDEs that use it, in bytes. + If a compilation unit exists for this frame, its address size must match the + address size here. + +6. `segment_selector_size` (ubyte) + + The size of a segment selector in this CIE and any FDEs that use it, in + bytes. + +7. `code_alignment_factor` (unsigned LEB128) + + A constant that is factored out of all advance location instructions (see + [6.4.2.1 Row Creation Instructions](#row-creation-instructions)). The + resulting value is `(operand * code_alignment_factor)`. + +8. 
`data_alignment_factor` (signed LEB128) + + A constant that is factored out of certain offset instructions (see [6.4.2.2 + CFA Definition Instructions](#cfa-definition-instructions) and [6.4.2.3 + Register Rule Instructions](#register-rule-instructions)). The + resulting value is `(operand * data_alignment_factor)`. + +9. `return_address_register` (unsigned LEB128) + + An unsigned LEB128 constant that indicates which column in the rule table + represents the return address of the subprogram. Note that this column might + not correspond to an actual machine register. + + The value of the return address register is used to determine the program + location of the caller frame. The program location of the top frame is the + target architecture program counter value of the current thread. + +10. `initial_instructions` (array of ubyte) + + A sequence of rules that are interpreted to create the initial setting of + each column in the table. + + The default rule for all columns before interpretation of the initial + instructions is the undefined rule. However, an ABI authoring body or a + compilation system authoring body may specify an alternate default value for + any or all columns. + +11. `padding` (array of ubyte) + + Enough `DW_CFA_nop` instructions to make the size of this entry match the + length value above. + +An FDE contains the following fields, in order: + +1. `length` (initial length) + + A constant that gives the number of bytes of the header and instruction + stream for this subprogram, not including the length field itself. The size + of the length field plus the value of length must be an integral multiple of + the address size. + +2. `CIE_pointer` (4 or 8 bytes, see [7.4 32-Bit and 64-Bit DWARF + Formats](#32-bit-and-64-bit-dwarf-formats)) + + A constant offset into the `.debug_frame` section that denotes the CIE that + is associated with this FDE. + +3. `initial_location` (segment selector and target address) + + The address of the first location associated with this table entry. If the + segment_selector_size field of this FDE's CIE is non-zero, the initial + location is preceded by a segment selector of the given length. + +4. `address_range` (target address) + + The number of bytes of program instructions described by this entry. + +5. `instructions` (array of ubyte) + + A sequence of table defining instructions that are described in [6.4.2 Call + Frame Instructions](#call-frame-instructions). + +6. `padding` (array of ubyte) + + Enough `DW_CFA_nop` instructions to make the size of this entry match the + length value above. + +#### A.6.4.2 Call Frame Instructions + +Some call frame instructions have operands that are encoded as DWARF operation +expressions E (see [2.5.4 DWARF Operation +Expressions](#dwarf-operation-expressions)). The DWARF operations that can be +used in E have the following restrictions: + +- `DW_OP_addrx`, `DW_OP_call2`, `DW_OP_call4`, `DW_OP_call_ref`, + `DW_OP_const_type`, `DW_OP_constx`, `DW_OP_convert`, `DW_OP_deref_type`, + `DW_OP_fbreg`, `DW_OP_implicit_pointer`, `DW_OP_regval_type`, + `DW_OP_reinterpret`, and `DW_OP_xderef_type` operations are not allowed + because the call frame information must not depend on other debug sections. +- `DW_OP_push_object_address` is not allowed because there is no object context + to provide a value to push. +- `DW_OP_call_frame_cfa` and `DW_OP_entry_value` are not allowed because their + use would be circular. 
+ +Call frame instructions to which these restrictions apply include +`DW_CFA_def_cfa_expression`, `DW_CFA_expression`, and +`DW_CFA_val_expression`. + +##### A.6.4.2.1 Row Creation Instructions + +> NOTE: These instructions are the same as in DWARF Version 5 section 6.4.2.1. + +##### A.6.4.2.2 CFA Definition Instructions + +1. `DW_CFA_def_cfa` + + The `DW_CFA_def_cfa` instruction takes two unsigned LEB128 operands + representing a register number R and a (non-factored) byte displacement B. + The required action is to define the current CFA rule to be the result of + evaluating the DWARF operation expression `DW_OP_bregx R, B` as a location + description. + +2. `DW_CFA_def_cfa_sf` + + The `DW_CFA_def_cfa_sf` instruction takes two operands: an unsigned LEB128 + value representing a register number R and a signed LEB128 factored byte + displacement B. The required action is to define the current CFA rule to be + the result of evaluating the DWARF operation expression `DW_OP_bregx R, B * + data_alignment_factor` as a location description. + + The action is the same as `DW_CFA_def_cfa`, except that the second + operand is signed and factored. + +3. `DW_CFA_def_cfa_register` + + The `DW_CFA_def_cfa_register` instruction takes a single unsigned LEB128 + operand representing a register number R. The required action is to define + the current CFA rule to be the result of evaluating the DWARF operation + expression `DW_OP_bregx R, B` as a location description. B is the old CFA + byte displacement. + + If the subprogram has no current CFA rule, or the rule was defined by a + `DW_CFA_def_cfa_expression` instruction, then the DWARF is ill-formed. + +4. `DW_CFA_def_cfa_offset` + + The `DW_CFA_def_cfa_offset` instruction takes a single unsigned LEB128 + operand representing a (non-factored) byte displacement B. The required + action is to define the current CFA rule to be the result of evaluating the + DWARF operation expression `DW_OP_bregx R, B` as a location description. R + is the old CFA register number. + + If the subprogram has no current CFA rule, or the rule was defined by a + `DW_CFA_def_cfa_expression` instruction, then the DWARF is ill-formed. + +5. `DW_CFA_def_cfa_offset_sf` + + The `DW_CFA_def_cfa_offset_sf` instruction takes a signed LEB128 operand + representing a factored byte displacement B. The required action is to + define the current CFA rule to be the result of evaluating the DWARF + operation expression `DW_OP_bregx R, B * data_alignment_factor` as a + location description. R is the old CFA register number. + + If the subprogram has no current CFA rule, or the rule was defined by a + `DW_CFA_def_cfa_expression` instruction, then the DWARF is ill-formed. + + The action is the same as `DW_CFA_def_cfa_offset`, except that the + operand is signed and factored. + +6. `DW_CFA_def_cfa_expression` + + The `DW_CFA_def_cfa_expression` instruction takes a single operand encoded + as a `DW_FORM_exprloc` value representing a DWARF operation expression E. + The required action is to define the current CFA rule to be the result of + evaluating E with the current context, except the result kind is a location + description, the compilation unit is unspecified, the object is unspecified, + and an empty initial stack. + + See [6.4.2 Call Frame Instructions](#call-frame-instructions) regarding + restrictions on the DWARF expression operations that can be used in E. + + The DWARF is ill-formed if the result of evaluating E is not a memory byte + address location description. 
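+
+As a non-normative illustration (the register numbers and displacements are
+hypothetical), a typical sequence of CFA definition instructions, each
+annotated with the operation expression that defines the resulting CFA rule per
+the list above, is:
+
+```
+DW_CFA_def_cfa 7, 8        ; CFA rule: DW_OP_bregx 7, 8
+DW_CFA_def_cfa_offset 16   ; register kept: DW_OP_bregx 7, 16
+DW_CFA_def_cfa_register 6  ; displacement kept: DW_OP_bregx 6, 16
+```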
+ +##### A.6.4.2.3 Register Rule Instructions + +1. `DW_CFA_undefined` + + The `DW_CFA_undefined` instruction takes a single unsigned LEB128 operand + that represents a register number R. The required action is to set the rule + for the register specified by R to `undefined`. + +2. `DW_CFA_same_value` + + The `DW_CFA_same_value` instruction takes a single unsigned LEB128 operand + that represents a register number R. The required action is to set the rule + for the register specified by R to `same value`. + +3. `DW_CFA_offset` + + The `DW_CFA_offset` instruction takes two operands: a register number R + (encoded with the opcode) and an unsigned LEB128 constant representing a + factored displacement B. The required action is to change the rule for the + register specified by R to be an offset(B * data_alignment_factor) + rule. + + > NOTE: Seems this should be named `DW_CFA_offset_uf` since the offset is + > unsigned factored. + +4. `DW_CFA_offset_extended` + + The `DW_CFA_offset_extended` instruction takes two unsigned LEB128 operands + representing a register number R and a factored displacement B. This + instruction is identical to `DW_CFA_offset`, except for the encoding and + size of the register operand. + + > NOTE: Seems this should be named `DW_CFA_offset_extended_uf` since the + > displacement is unsigned factored. + +5. `DW_CFA_offset_extended_sf` + + The `DW_CFA_offset_extended_sf` instruction takes two operands: an unsigned + LEB128 value representing a register number R and a signed LEB128 factored + displacement B. This instruction is identical to `DW_CFA_offset_extended`, + except that B is signed. + +6. `DW_CFA_val_offset` + + The `DW_CFA_val_offset` instruction takes two unsigned LEB128 operands + representing a register number R and a factored displacement B. The required + action is to change the rule for the register indicated by R to be a + val_offset(B * data_alignment_factor) rule. + + > NOTE: Seems this should be named `DW_CFA_val_offset_uf` since the + > displacement is unsigned factored. + +7. `DW_CFA_val_offset_sf` + + The `DW_CFA_val_offset_sf` instruction takes two operands: an unsigned + LEB128 value representing a register number R and a signed LEB128 factored + displacement B. This instruction is identical to `DW_CFA_val_offset`, except + that B is signed. + +8. `DW_CFA_register` + + The `DW_CFA_register` instruction takes two unsigned LEB128 operands + representing register numbers R1 and R2 respectively. The required action is + to set the rule for the register specified by R1 to be a register(R2) + rule. + +9. `DW_CFA_expression` + + The `DW_CFA_expression` instruction takes two operands: an unsigned LEB128 + value representing a register number R, and a `DW_FORM_block` value + representing a DWARF operation expression E. The required action is to + change the rule for the register specified by R to be an + expression(E) rule. + + That is, E computes the location description where the register value can + be retrieved. + + See [6.4.2 Call Frame Instructions](#call-frame-instructions) regarding + restrictions on the DWARF expression operations that can be used in E. + +10. `DW_CFA_val_expression` + + The `DW_CFA_val_expression` instruction takes two operands: an unsigned + LEB128 value representing a register number R, and a `DW_FORM_block` value + representing a DWARF operation expression E. The required action is to + change the rule for the register specified by R to be a + val_expression(E) rule. + + That is, E computes the value of register R.
+ + See [6.4.2 Call Frame Instructions](#call-frame-instructions) regarding + restrictions on the DWARF expression operations that can be used in E. + + If the result of evaluating E is not a value with a base type size that + matches the register size, then the DWARF is ill-formed. + +11. `DW_CFA_restore` + + The `DW_CFA_restore` instruction takes a single operand (encoded with the + opcode) that represents a register number R. The required action is to + change the rule for the register specified by R to the rule assigned it by + the `initial_instructions` in the CIE. + +12. `DW_CFA_restore_extended` + + The `DW_CFA_restore_extended` instruction takes a single unsigned LEB128 + operand that represents a register number R. This instruction is identical + to `DW_CFA_restore`, except for the encoding and size of the register + operand. + +##### A.6.4.2.4 Row State Instructions + +> NOTE: These instructions are the same as in DWARF Version 5 section 6.4.2.4. + +##### A.6.4.2.5 Padding Instruction + +> NOTE: These instructions are the same as in DWARF Version 5 section 6.4.2.5. + +#### A.6.4.3 Call Frame Instruction Usage + +> NOTE: The same as in DWARF Version 5 section 6.4.3. + +#### A.6.4.4 Call Frame Calling Address + +> NOTE: The same as in DWARF Version 5 section 6.4.4. + +## A.7 Data Representation + +> NOTE: This section provides changes to existing debugger information entry +> attributes. These would be incorporated into the corresponding DWARF Version 5 +> chapter 7 sections. + +### A.7.4 32-Bit and 64-Bit DWARF Formats + +> NOTE: This augments DWARF Version 5 section 7.4 list item 3's table. + + Form Role + ------------------------ -------------------------------------- + DW_OP_implicit_pointer offset in `.debug_info` + +### A.7.5 Format of Debugging Information + +#### A.7.5.5 Classes and Forms + +> NOTE: The same as in DWARF Version 5 section 7.5.5. + +### A.7.7 DWARF Expressions + +> NOTE: Rename DWARF Version 5 section 7.7 to reflect the unification of +> location descriptions into DWARF expressions. + +#### A.7.7.1 Operation Expressions + +> NOTE: Rename DWARF Version 5 section 7.7.1 and delete section 7.7.2 to reflect +> the unification of location descriptions into DWARF expressions. + +#### A.7.7.3 Location List Expressions + +> NOTE: Rename DWARF Version 5 section 7.7.3 to reflect that location lists are +> a kind of DWARF expression. + +# B. Further Information + +The following references provide additional information on the extension. +A reference to the DWARF standard is provided. + +A formatted version of this extension is available on the LLVM site. It includes +many figures that help illustrate the textual description, especially of the +example DWARF expression evaluations. + Slides and a video of a presentation at the Linux Plumbers Conference 2021 related to this extension are available. -The LLVM compiler extension includes possible normative text changes for this -extension as well as the operations mentioned in the motivating examples. It -also covers other extensions needed for heterogeneous devices. +The LLVM compiler extension includes the operations mentioned in the motivating +examples. It also covers other extensions needed for heterogeneous devices.
+- [DWARF Debugging Information Format](https://dwarfstd.org/) + - [DWARF Debugging Information Format Version 5](https://dwarfstd.org/Dwarf5Std.php) +- [Allow Location Descriptions on the DWARF Expression Stack](https://llvm.org/docs/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack.html) - DWARF extensions for optimized SIMT/SIMD (GPU) debugging - Linux Plumbers Conference 2021 - [Video](https://www.youtube.com/watch?v=QiR0ra0ymEY&t=10015s) - [Slides](https://linuxplumbersconf.org/event/11/contributions/1012/attachments/798/1505/DWARF_Extensions_for_Optimized_SIMT-SIMD_GPU_Debugging-LPC2021.pdf) diff --git a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst --- a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst +++ b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst @@ -15,425 +15,583 @@ .. _amdgpu-dwarf-introduction: -Introduction -============ +1. Introduction +=============== AMD [:ref:`AMD `] has been working on supporting heterogeneous -computing through the AMD Radeon Open Compute Platform (ROCm) [:ref:`AMD-ROCm -`]. A heterogeneous computing program can be written in a -high level language such as C++ or Fortran with OpenMP pragmas, OpenCL, or HIP -(a portable C++ programming environment for heterogeneous computing [:ref:`HIP +computing. A heterogeneous computing program can be written in a high level +language such as C++ or Fortran with OpenMP pragmas, OpenCL, or HIP (a portable +C++ programming environment for heterogeneous computing [:ref:`HIP `]). A heterogeneous compiler and runtime allows a program to execute on multiple devices within the same native process. Devices could include CPUs, GPUs, DSPs, FPGAs, or other special purpose accelerators. Currently HIP programs execute on systems with CPUs and GPUs. -ROCm is fully open sourced and includes contributions to open source projects -such as LLVM for compilation [:ref:`LLVM `] and GDB for -debugging [:ref:`GDB `], as well as collaboration with other -third party projects such as the GCC compiler [:ref:`GCC `] -and the Perforce TotalView HPC debugger [:ref:`Perforce-TotalView +The AMD [:ref:`AMD `] ROCm platform [:ref:`AMD-ROCm +`] is an implementation of the industry standard for +heterogeneous computing devices defined by the Heterogeneous System Architecture +(HSA) Foundation [:ref:`HSA `]. It is open sourced and +includes contributions to open source projects such as LLVM [:ref:`LLVM +`] for compilation and GDB for debugging [:ref:`GDB +`]. + +The LLVM compiler has upstream support for commercially available AMD GPU +hardware (AMDGPU) [:ref:`AMDGPU-LLVM `]. The open +source ROCgdb [:ref:`AMD-ROCgdb `] GDB based debugger +also has support for AMDGPU which is being upstreamed. Support for AMDGPU is +also being added by third parties to the GCC [:ref:`GCC `] +compiler and the Perforce TotalView HPC Debugger [:ref:`Perforce-TotalView `]. To support debugging heterogeneous programs several features that are not provided by current DWARF Version 5 [:ref:`DWARF `] have -been identified. This document contains a collection of extensions to address -providing those features. - -The :ref:`amdgpu-dwarf-motivation` section describes the issues that are being -addressed for heterogeneous computing. That is followed by the -:ref:`amdgpu-dwarf-changes-relative-to-dwarf-version-5` section containing the +been identified. 
The :ref:`amdgpu-dwarf-extensions` section gives an overview of +the extensions devised to address the missing features. The extensions seek to +be general in nature and backwards compatible with DWARF Version 5. Their goal +is to be applicable to meeting the needs of any heterogeneous system and not be +vendor or architecture specific. That is followed by appendix +:ref:`amdgpu-dwarf-changes-relative-to-dwarf-version-5` which contains the textual changes for the extensions relative to the DWARF Version 5 standard. -Then there is an :ref:`amdgpu-dwarf-examples` section that links to the AMD GPU -specific usage of the extensions that includes an example. Finally, there is a -:ref:`amdgpu-dwarf-references` section. There are a number of notes included -that raise open questions, or provide alternative approaches considered. The -extensions seek to be general in nature and backwards compatible with DWARF -Version 5. The goal is to be applicable to meeting the needs of any -heterogeneous system and not be vendor or architecture specific. - -A fundamental aspect of the extensions is that it allows DWARF expression -location descriptions as stack elements. The extensions are based on DWARF -Version 5 and maintains compatibility with DWARF Version 5. After attempting -several alternatives, the current thinking is that such extensions to DWARF -Version 5 are the simplest and cleanest ways to support debugging optimized GPU -code. It also appears to be generally useful and may be able to address other -reported DWARF issues, as well as being helpful in providing better optimization -support for non-GPU code. - -General feedback on these extensions is sought, together with suggestions on how -to clarify, simplify, or organize them. If their is general interest then some -or all of these extensions could be submitted as future DWARF proposals. - -We are in the process of modifying LLVM and GDB to support these extensions -which is providing experience and insights. We plan to upstream the changes to -those projects for any final form of the extensions. - -The author very much appreciates the input provided so far by many others which -has been incorporated into this current version. - -.. _amdgpu-dwarf-motivation: - -Motivation -========== - -This document presents a set of backwards compatible extensions to DWARF Version -5 [:ref:`DWARF `] to support heterogeneous debugging. - -The remainder of this section provides motivation for each extension in -terms of heterogeneous debugging on commercially available AMD GPU hardware -(AMDGPU). The goal is to add support to the AMD [:ref:`AMD `] -open source Radeon Open Compute Platform (ROCm) [:ref:`AMD-ROCm -`] which is an implementation of the industry standard -for heterogeneous computing devices defined by the Heterogeneous System -Architecture (HSA) Foundation [:ref:`HSA `]. ROCm includes the -LLVM compiler [:ref:`LLVM `] with upstreamed support for -AMDGPU [:ref:`AMDGPU-LLVM `]. The goal is to also add -the GDB debugger [:ref:`GDB `] with upstreamed support for -AMDGPU [:ref:`AMD-ROCgdb `]. In addition, the goal is -to work with third parties to enable support for AMDGPU debugging in the GCC -compiler [:ref:`GCC `] and the Perforce TotalView HPC debugger -[:ref:`Perforce-TotalView `]. - -However, the extensions are intended to be vendor and architecture neutral. They -are believed to apply to other heterogeneous hardware devices including GPUs, -DSPs, FPGAs, and other specialized hardware. 
These collectively include similar
-characteristics and requirements as AMDGPU devices. Some of the extension can
-also apply to traditional CPU hardware that supports large vector registers.
-Compilers can map source languages and extensions that describe large scale
-parallel execution onto the lanes of the vector registers. This is common in
-programming languages used in ML and HPC. The extensions also include improved
-support for optimized code on any architecture. Some of the generalizations may
-also benefit other issues that have been raised.
-
-The extensions have evolved through collaboration with many individuals and
+There are a number of notes included that raise open questions, or provide
+alternative approaches that may be worth considering. Then appendix
+:ref:`amdgpu-dwarf-examples` links to the AMD GPU specific usage of the
+extensions that includes an example. Finally, appendix
+:ref:`amdgpu-dwarf-references` provides references to further information.
+
+.. _amdgpu-dwarf-extensions:
+
+2. Extensions
+=============
+
+The extensions continue to evolve through collaboration with many individuals and
 active prototyping within the GDB debugger and LLVM compiler. Input has also
 been very much appreciated from the developers working on the Perforce
 TotalView HPC Debugger and GCC compiler.
 
-The AMDGPU has several features that require additional DWARF functionality in
-order to support optimized code.
+The inputs provided and insights gained so far have been incorporated into this
+current version. The plan is to participate in upstreaming the work and
+addressing any feedback. If there is general interest then some or all of these
+extensions could be submitted as future DWARF standard proposals.
 
-AMDGPU optimized code may spill vector registers to non-global address space
-memory, and this spilling may be done only for lanes that are active on entry
-to the subprogram. To support this, a location description that can be created
-as a masked select is required. See ``DW_OP_LLVM_select_bit_piece``.
+The general principles in designing the extensions have been:
 
-Since the active lane mask may be held in a register, a way to get the value
-of a register on entry to a subprogram is required. To support this an
-operation that returns the caller value of a register as specified by the Call
-Frame Information (CFI) is required. See ``DW_OP_LLVM_call_frame_entry_reg``
-and :ref:`amdgpu-dwarf-call-frame-information`.
+1. Be backwards compatible with the DWARF Version 5 [:ref:`DWARF
+   `] standard.
 
-Current DWARF uses an empty expression to indicate an undefined location
-description. Since the masked select composite location description operation
-takes more than one location description, it is necessary to have an explicit
-way to specify an undefined location description. Otherwise it is not possible
-to specify that a particular one of the input location descriptions is
-undefined. See ``DW_OP_LLVM_undefined``.
+2. Be vendor and architecture neutral. They are intended to apply to other
+   heterogeneous hardware devices including GPUs, DSPs, FPGAs, and other
+   specialized hardware. These collectively include similar characteristics and
+   requirements as AMDGPU devices.
+
+3. Provide improved optimization support for non-GPU code. For example, some
+   extensions apply to traditional CPU hardware that supports large vector
+   registers. Compilers can map source languages, and source language
+   extensions, that describe large scale parallel execution, onto the lanes of
+   the vector registers. This is common in programming languages used in ML and
+   HPC.
+
+4. Fully define well-formed DWARF in a consistent style based on the DWARF
+   Version 5 specification.
+
+It is possible that some of the generalizations may also benefit other DWARF
+issues that have been raised.
+
+The remainder of this section enumerates the extensions and provides motivation
+for each in terms of heterogeneous debugging.
+
+.. _amdgpu-dwarf-allow-location-description-on-the-dwarf-evaluation-stack:
+
+2.1 Allow Location Description on the DWARF Expression Stack
+------------------------------------------------------------
+
+DWARF Version 5 does not allow location descriptions to be entries on the DWARF
+expression stack. They can only be the final result of the evaluation of a DWARF
+expression. However, by allowing a location description to be a first-class
+entry on the DWARF expression stack it becomes possible to compose expressions
+containing both values and location descriptions naturally. It allows objects to
+be located in any kind of memory address space, in registers, be implicit
+values, be undefined, or a composite of any of these.
+
+By extending DWARF carefully, all existing DWARF expressions can retain their
+current semantic meaning. DWARF has implicit conversions that convert from a
+value that represents an address in the default address space to a memory
+location description. This can be extended to allow a default address space
+memory location description to be implicitly converted back to its address
+value. This allows all DWARF Version 5 expressions to retain their same meaning,
+while enabling the ability to explicitly create memory location descriptions in
+non-default address spaces and generalizing the power of composite location
+descriptions to any kind of location description.
+
+For those familiar with the definition of location descriptions in DWARF Version
+5, the definitions in these extensions are presented differently, but do in
+fact define the same concept with the same fundamental semantics. However, they
+do so in a way that allows the concept to extend to support address spaces,
+bit addressing, the ability for composite location descriptions to be composed
+of any kind of location description, and the ability to support objects located
+at multiple places. Collectively these changes expand the set of architectures
+that can be supported and improve support for optimized code.
+
+Several approaches were considered, and the one presented, together with the
+extensions it enables, appears to be the simplest and cleanest one that offers
+the greatest improvement of DWARF's ability to support debugging optimized GPU
+and non-GPU code. Examining the GDB debugger and LLVM compiler, it appears only
+to require modest changes as they both already have to support general use of
+location descriptions. It is anticipated that this will also be the case for
+other debuggers and compilers.
+
+GDB has been modified to evaluate DWARF Version 5 expressions with location
+descriptions as stack entries and with implicit conversions. All GDB tests have
+passed, except one that turned out to be an invalid test case by DWARF Version 5
+rules. The code in GDB actually became simpler as all evaluation is done on a
+single stack and there was no longer a need to maintain a separate structure for
+the location description results.
This gives confidence in backwards +compatibility. + +See :ref:`amdgpu-dwarf-expressions` and nested sections. + +This extension is separately described at *Allow Location Descriptions on the +DWARF Expression Stack* [:ref:`AMDGPU-DWARF-LOC +`]. + +2.2 Generalize CFI to Allow Any Location Description Kind +--------------------------------------------------------- CFI describes restoring callee saved registers that are spilled. Currently CFI only allows a location description that is a register, memory address, or -implicit location description. AMDGPU optimized code may spill scalar -registers into portions of vector registers. This requires extending CFI to -allow any location description. See -:ref:`amdgpu-dwarf-call-frame-information`. +implicit location description. AMDGPU optimized code may spill scalar registers +into portions of vector registers. This requires extending CFI to allow any +location description kind to be supported. -The vector registers of the AMDGPU are represented as their full wavefront -size, meaning the wavefront size times the dword size. This reflects the -actual hardware and allows the compiler to generate DWARF for languages that -map a thread to the complete wavefront. It also allows more efficient DWARF to -be generated to describe the CFI as only a single expression is required for -the whole vector register, rather than a separate expression for each lane's -dword of the vector register. It also allows the compiler to produce DWARF -that indexes the vector register if it spills scalar registers into portions -of a vector register. +See :ref:`amdgpu-dwarf-call-frame-information`. -Since DWARF stack value entries have a base type and AMDGPU registers are a -vector of dwords, the ability to specify that a base type is a vector is -required. See ``DW_AT_LLVM_vector_size``. +2.3 Generalize DWARF Operation Expressions to Support Multiple Places +--------------------------------------------------------------------- -If the source language is mapped onto the AMDGPU wavefronts in a SIMT manner, -then the variable DWARF location expressions must compute the location for a -single lane of the wavefront. Therefore, a DWARF operation is required to denote -the current lane, much like ``DW_OP_push_object_address`` denotes the current -object. The ``DW_OP_*piece`` operations only allow literal indices. Therefore, a -way to use a computed offset of an arbitrary location description (such as a -vector register) is required. See ``DW_OP_LLVM_push_lane``, -``DW_OP_LLVM_offset``, ``DW_OP_LLVM_offset_uconst``, and -``DW_OP_LLVM_bit_offset``. - -If the source language is mapped onto the AMDGPU wavefronts in a SIMT manner -the compiler can use the AMDGPU execution mask register to control which lanes -are active. To describe the conceptual location of non-active lanes a DWARF -expression is needed that can compute a per lane PC. For efficiency, this is -done for the wavefront as a whole. This expression benefits by having a masked -select composite location description operation. This requires an attribute -for source location of each lane. The AMDGPU may update the execution mask for -whole wavefront operations and so needs an attribute that computes the current -active lane mask. See ``DW_OP_LLVM_select_bit_piece``, ``DW_OP_LLVM_extend``, -``DW_AT_LLVM_lane_pc``, and ``DW_AT_LLVM_active_lane``. +In DWARF Version 5 a location description is defined as a single location +description or a location list. 
A location list is defined as either
+effectively an undefined location description or as one or more single
+location descriptions to describe an object with multiple places.
+
+With
+:ref:`amdgpu-dwarf-allow-location-description-on-the-dwarf-evaluation-stack`,
+the ``DW_OP_push_object_address`` and ``DW_OP_call*`` operations can put a
+location description on the stack. Furthermore, debugger information entry
+attributes such as ``DW_AT_data_member_location``, ``DW_AT_use_location``, and
+``DW_AT_vtable_elem_location`` are defined as pushing a location description on
+the expression stack before evaluating the expression.
+
+DWARF Version 5 only allows the stack to contain values and so only a single
+memory address can be on the stack. This makes these operations and attributes
+incapable of handling location descriptions with multiple places, or places
+other than memory.
+
+Since
+:ref:`amdgpu-dwarf-allow-location-description-on-the-dwarf-evaluation-stack`
+allows the stack to contain location descriptions, the operations are
+generalized to support location descriptions that can have multiple places. This
+is backwards compatible with DWARF Version 5 and allows objects with multiple
+places to be supported. For example, the expression that describes how to access
+the field of an object can be evaluated with a location description that has
+multiple places and will result in a location description with multiple places.
+
+With this change, the separate DWARF Version 5 sections that described DWARF
+expressions and location lists are unified into a single section that describes
+DWARF expressions in general. This unification is a natural consequence of, and
+a necessity of, allowing location descriptions to be part of the evaluation
+stack.
+
+See :ref:`amdgpu-dwarf-location-description`.
+
+2.4 Generalize Offsetting of Location Descriptions
+--------------------------------------------------
+
+The ``DW_OP_plus`` and ``DW_OP_minus`` operations can be defined to operate on a
+memory location description in the default target architecture specific address
+space and a generic type value to produce an updated memory location
+description. This allows them to continue to be used to offset an address.
+
+To generalize offsetting to any location description, including location
+descriptions that describe when bytes are in registers, are implicit, or a
+composite of these, the ``DW_OP_LLVM_offset``, ``DW_OP_LLVM_offset_uconst``, and
+``DW_OP_LLVM_bit_offset`` offset operations are added.
+
+The offset operations can operate on location storage of any size. For example,
+implicit location storage could be any number of bits in size. It is simpler to
+define offsets that exceed the size of the location storage as being an
+evaluation error, than having to force an implementation to support potentially
+infinite precision offsets to allow it to correctly track a series of positive
+and negative offsets that may transiently overflow or underflow, but end up in
+range. This is simple for the arithmetic operations as they are defined in terms
+of two's complement arithmetic on a base type of a fixed size. Therefore, the
+offset operations define that integer overflow is ill-formed. This is in
+contrast to the ``DW_OP_plus``, ``DW_OP_plus_uconst``, and ``DW_OP_minus``
+arithmetic operations which define that it causes wrap-around.
+
+Having the offset operations allows ``DW_OP_push_object_address`` to push a
+location description that may be in a register, or be an implicit value.
The +DWARF expression of ``DW_TAG_ptr_to_member_type`` can use the offset operations +without regard to what kind of location description was pushed. + +Since +:ref:`amdgpu-dwarf-allow-location-description-on-the-dwarf-evaluation-stack` has +generalized location storage to be bit indexable, ``DW_OP_LLVM_bit_offset`` +generalizes DWARF to work with bit fields. This is generally not possible in +DWARF Version 5. + +The ``DW_OP_*piece`` operations only allow literal indices. A way to use a +computed offset of an arbitrary location description (such as a vector register) +is required. The offset operations provide this ability since they can be used +to compute a location description on the stack. + +See ``DW_OP_LLVM_offset``, ``DW_OP_LLVM_offset_uconst``, and +``DW_OP_LLVM_bit_offset`` in +:ref:`amdgpu-dwarf-general-location-description-operations`. + +2.5 Generalize Creation of Undefined Location Descriptions +---------------------------------------------------------- + +Current DWARF uses an empty expression to indicate an undefined location +description. Since +:ref:`amdgpu-dwarf-allow-location-description-on-the-dwarf-evaluation-stack` +allows location descriptions to be created on the stack, it is necessary to have +an explicit way to specify an undefined location description. + +For example, the ``DW_OP_LLVM_select_bit_piece`` (see +:ref:`amdgpu-dwarf-support-for-divergent-control-flow-of-simt-hardware`) +operation takes more than one location description on the stack. Without this +ability, it is not possible to specify that a particular one of the input +location descriptions is undefined. + +See the ``DW_OP_LLVM_undefined`` operation in +:ref:`amdgpu-dwarf-undefined-location-description-operations`. + +2.6 Generalize Creation of Composite Location Descriptions +---------------------------------------------------------- + +To allow composition of composite location descriptions, an explicit operation +that indicates the end of the definition of a composite location description is +required. This can be implied if the end of a DWARF expression is reached, +allowing current DWARF expressions to remain legal. + +See ``DW_OP_LLVM_piece_end`` in +:ref:`amdgpu-dwarf-composite-location-description-operations`. + +2.7 Generalize DWARF Base Objects to Allow Any Location Description Kind +------------------------------------------------------------------------ + +The number of registers and the cost of memory operations is much higher for +AMDGPU than a typical CPU. The compiler attempts to optimize whole variables and +arrays into registers. + +Currently DWARF only allows ``DW_OP_push_object_address`` and related operations +to work with a global memory location. To support AMDGPU optimized code it is +required to generalize DWARF to allow any location description to be used. This +allows registers, or composite location descriptions that may be a mixture of +memory, registers, or even implicit values. + +See ``DW_OP_push_object_address`` in +:ref:`amdgpu-dwarf-general-location-description-operations`. + +2.8 General Support for Address Spaces +-------------------------------------- AMDGPU needs to be able to describe addresses that are in different kinds of memory. Optimized code may need to describe a variable that resides in pieces that are in different kinds of storage which may include parts of registers, memory that is in a mixture of memory kinds, implicit values, or be undefined. + DWARF has the concept of segment addresses. 
However, the segment cannot be specified within a DWARF expression, which is only able to specify the offset portion of a segment address. The segment index is only provided by the entity -that specifies the DWARF expression. Therefore, the segment index is a -property that can only be put on complete objects, such as a variable. That -makes it only suitable for describing an entity (such as variable or -subprogram code) that is in a single kind of memory. Therefore, AMDGPU uses -the DWARF concept of address spaces. For example, a variable may be allocated -in a register that is partially spilled to the call stack which is in the -private address space, and partially spilled to the local address space. +that specifies the DWARF expression. Therefore, the segment index is a property +that can only be put on complete objects, such as a variable. That makes it only +suitable for describing an entity (such as variable or subprogram code) that is +in a single kind of memory. + +Therefore, AMDGPU uses the DWARF concept of address spaces. For example, a +variable may be allocated in a register that is partially spilled to the call +stack which is in the private address space, and partially spilled to the local +address space. DWARF uses the concept of an address in many expression operations but does not define how it relates to address spaces. For example, ``DW_OP_push_object_address`` pushes the address of an object. Other contexts implicitly push an address on the stack before evaluating an expression. For example, the ``DW_AT_use_location`` attribute of the -``DW_TAG_ptr_to_member_type``. The expression that uses the address needs to -do so in a general way and not need to be dependent on the address space of -the address. For example, a pointer to member value may want to be applied to -an object that may reside in any address space. - -The number of registers and the cost of memory operations is much higher for -AMDGPU than a typical CPU. The compiler attempts to optimize whole variables -and arrays into registers. Currently DWARF only allows -``DW_OP_push_object_address`` and related operations to work with a global -memory location. To support AMDGPU optimized code it is required to generalize -DWARF to allow any location description to be used. This allows registers, or -composite location descriptions that may be a mixture of memory, registers, or -even implicit values. - -DWARF Version 5 does not allow location descriptions to be entries on the -DWARF stack. They can only be the final result of the evaluation of a DWARF -expression. However, by allowing a location description to be a first-class -entry on the DWARF stack it becomes possible to compose expressions containing -both values and location descriptions naturally. It allows objects to be -located in any kind of memory address space, in registers, be implicit values, -be undefined, or a composite of any of these. By extending DWARF carefully, -all existing DWARF expressions can retain their current semantic meaning. -DWARF has implicit conversions that convert from a value that represents an -address in the default address space to a memory location description. This -can be extended to allow a default address space memory location description -to be implicitly converted back to its address value. 
This allows all DWARF -Version 5 expressions to retain their same meaning, while adding the ability -to explicitly create memory location descriptions in non-default address -spaces and generalizing the power of composite location descriptions to any -kind of location description. See :ref:`amdgpu-dwarf-operation-expressions`. - -To allow composition of composite location descriptions, an explicit operation -that indicates the end of the definition of a composite location description -is required. This can be implied if the end of a DWARF expression is reached, -allowing current DWARF expressions to remain legal. See -``DW_OP_LLVM_piece_end``. - -The ``DW_OP_plus`` and ``DW_OP_minus`` can be defined to operate on a memory -location description in the default target architecture specific address space -and a generic type value to produce an updated memory location description. This -allows them to continue to be used to offset an address. To generalize -offsetting to any location description, including location descriptions that -describe when bytes are in registers, are implicit, or a composite of these, the -``DW_OP_LLVM_offset``, ``DW_OP_LLVM_offset_uconst``, and -``DW_OP_LLVM_bit_offset`` offset operations are added. Unlike ``DW_OP_plus``, -``DW_OP_plus_uconst``, and ``DW_OP_minus`` arithmetic operations, these do not -define that integer overflow causes wrap-around. The offset operations can -operate on location storage of any size. For example, implicit location storage -could be any number of bits in size. It is simpler to define offsets that exceed -the size of the location storage as being an evaluation error, than having to -force an implementation to support potentially infinite precision offsets to -allow it to correctly track a series of positive and negative offsets that may -transiently overflow or underflow, but end up in range. This is simple for the -arithmetic operations as they are defined in terms of two's compliment -arithmetic on a base type of a fixed size. - -Having the offset operations allows ``DW_OP_push_object_address`` to push a -location description that may be in a register, or be an implicit value, and the -DWARF expression of ``DW_TAG_ptr_to_member_type`` can contain them to offset -within it. ``DW_OP_LLVM_bit_offset`` generalizes DWARF to work with bit fields -which is not possible in DWARF Version 5. +``DW_TAG_ptr_to_member_type``. The expression belongs to a source language type +which may apply to objects allocated in different kinds of storage. Therefore, +it is desirable that the expression that uses the address can do so without +regard to what kind of storage it specifies, including the address space of a +memory location description. For example, a pointer to member value may want to +be applied to an object that may reside in any address space. The DWARF ``DW_OP_xderef*`` operations allow a value to be converted into an address of a specified address space which is then read. But it provides no way to create a memory location description for an address in the non-default address space. For example, AMDGPU variables can be allocated in the local -address space at a fixed address. It is required to have an operation to -create an address in a specific address space that can be used to define the -location description of the variable. Defining this operation to produce a -location description allows the size of addresses in an address space to be -larger than the generic type. See ``DW_OP_LLVM_form_aspace_address``. 
-
-If the ``DW_OP_LLVM_form_aspace_address`` operation had to produce a value
-that can be implicitly converted to a memory location description, then it
-would be limited to the size of the generic type which matches the size of the
-default address space. Its value would be undefined and likely not match any
-value in the actual program. By making the result a location description, it
-allows a consumer great freedom in how it implements it. The implicit
-conversion back to a value can be limited only to the default address space to
-maintain compatibility with DWARF Version 5. For other address spaces the
-producer can use the new operations that explicitly specify the address space.
+address space at a fixed address.
+
+The ``DW_OP_LLVM_form_aspace_address`` (see
+:ref:`amdgpu-dwarf-memory-location-description-operations`) operation is defined
+to create a memory location description from an address and address space. It
+can be used to specify the location of a variable that is allocated in a
+specific address space. This allows the size of addresses in an address space to
+be larger than the generic type. It also allows a consumer great implementation
+freedom. It allows the implicit conversion back to a value to be limited only to
+the default address space to maintain compatibility with DWARF Version 5. For
+other address spaces the producer can use the new operations that explicitly
+specify the address space.
+
+In contrast, if the ``DW_OP_LLVM_form_aspace_address`` operation had been
+defined to produce a value, and an implicit conversion to a memory location
+description was defined, then it would be limited to the size of the generic
+type (which matches the size of the default address space). An implementation
+would likely have to use *reserved ranges* of values to represent different
+address spaces. Such a value would likely not match any address value in the
+actual hardware. That would require the consumer to have special treatment for
+such values.
 
 ``DW_OP_breg*`` treats the register as containing an address in the default
-address space. It is required to be able to specify the address space of the
-register value. See ``DW_OP_LLVM_aspace_bregx``.
+address space. A ``DW_OP_LLVM_aspace_bregx`` (see
+:ref:`amdgpu-dwarf-memory-location-description-operations`) operation is added
+to allow the address space of the address held in a register to be specified.
 
-Similarly, ``DW_OP_implicit_pointer`` treats its implicit pointer value as
-being in the default address space. It is required to be able to specify the
-address space of the pointer value. See
-``DW_OP_LLVM_aspace_implicit_pointer``.
+Similarly, ``DW_OP_implicit_pointer`` treats its implicit pointer value as being
+in the default address space. A ``DW_OP_LLVM_aspace_implicit_pointer``
+(:ref:`amdgpu-dwarf-implicit-location-description-operations`) operation is
+added to allow the address space to be specified.
 
 Almost all uses of addresses in DWARF are limited to defining location
 descriptions, or to be dereferenced to read memory. The exception is
-``DW_CFA_val_offset`` which uses the address to set the value of a register.
+``DW_CFA_val_offset`` which uses the address to set the value of a register. In
+order to support address spaces, the CFA DWARF expression is defined to be a
+memory location description. This allows it to specify an address space which is
+used to convert the offset address back to an address in that address space. See
 :ref:`amdgpu-dwarf-call-frame-information`.
 
-This approach allows all existing DWARF to have the identical semantics. It
-allows the compiler to explicitly specify the address space it is using. For
-example, a compiler could choose to access private memory in a swizzled manner
-when mapping a source language to a wavefront in a SIMT manner, or to access
-it in an unswizzled manner if mapping the same language with the wavefront
-being the thread. It also allows the compiler to mix the address space it uses
-to access private memory. For example, for SIMT it can still spill entire
-vector registers in an unswizzled manner, while using a swizzled private
-memory for SIMT variable access. This approach allows memory location
-descriptions for different address spaces to be combined using the regular
-``DW_OP_*piece`` operations.
-
-Location descriptions are an abstraction of storage, they give freedom to the
+This approach of extending memory location descriptions to support address
+spaces allows all existing DWARF Version 5 expressions to have the identical
+semantics. It allows the compiler to explicitly specify the address space it is
+using. For example, a compiler could choose to access private memory in a
+swizzled manner when mapping a source language thread to the lane of a wavefront
+in a SIMT manner. Or a compiler could choose to access it in an unswizzled
+manner if mapping the same language with the wavefront being the thread.
+
+It also allows the compiler to mix the address space it uses to access private
+memory. For example, for SIMT it can still spill entire vector registers in an
+unswizzled manner, while using a swizzled private memory for SIMT variable
+access.
+
+This approach also allows memory location descriptions for different address
+spaces to be combined using the regular ``DW_OP_*piece`` operations.
+
+Location descriptions are an abstraction of storage. They give freedom to the
 consumer on how to implement them. They allow the address space to encode lane
-information so they can be used to read memory with only the memory
-description and no extra arguments. The same set of operations can operate on
+information so they can be used to read memory with only the memory location
+description and no extra information. The same set of operations can operate on
 locations independent of their kind of storage. The ``DW_OP_deref*`` therefore
-can be used on any storage kind. ``DW_OP_xderef*`` is unnecessary, except to
-become a more compact way to convert a non-default address space address
-followed by dereferencing it.
+can be used on any storage kind, including memory location descriptions of
+different address spaces. Therefore, the ``DW_OP_xderef*`` operations are
+unnecessary, except to become a more compact way to encode a non-default address
+space address followed by dereferencing it. See
+:ref:`amdgpu-dwarf-general-operations`.
 
-In DWARF Version 5 a location description is defined as a single location
-description or a location list. A location list is defined as either
-effectively an undefined location description or as one or more single
-location descriptions to describe an object with multiple places. The
-``DW_OP_push_object_address`` and ``DW_OP_call*`` operations can put a
-location description on the stack.
Furthermore, debugger information entry -attributes such as ``DW_AT_data_member_location``, ``DW_AT_use_location``, and -``DW_AT_vtable_elem_location`` are defined as pushing a location description -on the expression stack before evaluating the expression. However, DWARF -Version 5 only allows the stack to contain values and so only a single memory -address can be on the stack which makes these incapable of handling location -descriptions with multiple places, or places other than memory. Since these -extensions allow the stack to contain location descriptions, the operations are -generalized to support location descriptions that can have multiple places. -This is backwards compatible with DWARF Version 5 and allows objects with -multiple places to be supported. For example, the expression that describes -how to access the field of an object can be evaluated with a location -description that has multiple places and will result in a location description -with multiple places as expected. With this change, the separate DWARF Version -5 sections that described DWARF expressions and location lists have been -unified into a single section that describes DWARF expressions in general. -This unification seems to be a natural consequence and a necessity of allowing -location descriptions to be part of the evaluation stack. +2.9 Support for Vector Base Types +--------------------------------- -For those familiar with the definition of location descriptions in DWARF Version -5, the definitions in these extensions are presented differently, but does -in fact define the same concept with the same fundamental semantics. However, -it does so in a way that allows the concept to extend to support address -spaces, bit addressing, the ability for composite location descriptions to be -composed of any kind of location description, and the ability to support -objects located at multiple places. Collectively these changes expand the set -of processors that can be supported and improves support for optimized code. - -Several approaches were considered, and the one presented appears to be the -cleanest and offers the greatest improvement of DWARF's ability to support -optimized code. Examining the GDB debugger and LLVM compiler, it appears only -to require modest changes as they both already have to support general use of -location descriptions. It is anticipated that will also be the case for other -debuggers and compilers. +The vector registers of the AMDGPU are represented as their full wavefront +size, meaning the wavefront size times the dword size. This reflects the +actual hardware and allows the compiler to generate DWARF for languages that +map a thread to the complete wavefront. It also allows more efficient DWARF to +be generated to describe the CFI as only a single expression is required for +the whole vector register, rather than a separate expression for each lane's +dword of the vector register. It also allows the compiler to produce DWARF +that indexes the vector register if it spills scalar registers into portions +of a vector register. -As an experiment, GDB was modified to evaluate DWARF Version 5 expressions -with location descriptions as stack entries and implicit conversions. All GDB -tests have passed, except one that turned out to be an invalid test by DWARF -Version 5 rules. The code in GDB actually became simpler as all evaluation was -on the stack and there was no longer a need to maintain a separate structure -for the location description result. 
This gives confidence of the backwards
-compatibility.
+Since DWARF stack value entries have a base type and AMDGPU registers are a
+vector of dwords, the ability to specify that a base type is a vector is
+required.
+
+See ``DW_AT_LLVM_vector_size`` in :ref:`amdgpu-dwarf-literal-operations`.
+
+.. _amdgpu-dwarf-operation-to-create-vector-composite-location-descriptions:
+
+2.10 DWARF Operations to Create Vector Composite Location Descriptions
+-----------------------------------------------------------------------
+
+AMDGPU optimized code may spill vector registers to non-global address space
+memory, and this spilling may be done only for SIMT lanes that are active on
+entry to the subprogram.
+
+To support this, a composite location description that can be created as a
+masked select is required. In addition, an operation that creates a composite
+location description that is a vector on another location description is needed.
+
+An example that uses these operations is referenced in the
+:ref:`amdgpu-dwarf-examples` appendix.
+
+See ``DW_OP_LLVM_select_bit_piece`` and ``DW_OP_LLVM_extend`` in
+:ref:`amdgpu-dwarf-composite-location-description-operations`.
+
+2.11 DWARF Operation to Access Call Frame Entry Registers
+---------------------------------------------------------
+
+As described in
+:ref:`amdgpu-dwarf-operation-to-create-vector-composite-location-descriptions`,
+a DWARF expression involving the set of SIMT lanes active on entry to a
+subprogram is required. The SIMT active lane mask may be held in a register that
+is modified as the subprogram executes. However, its value may be saved on entry
+to the subprogram.
+
+The Call Frame Information (CFI) already encodes such register saving, so it is
+more efficient to provide an operation to return the location of a saved
+register than having to generate a loclist to describe the same information.
+This is now possible since
+:ref:`amdgpu-dwarf-allow-location-description-on-the-dwarf-evaluation-stack`
+allows location descriptions on the stack.
+
+See ``DW_OP_LLVM_call_frame_entry_reg`` in
+:ref:`amdgpu-dwarf-general-location-description-operations` and
+:ref:`amdgpu-dwarf-call-frame-information`.
+
+2.12 Support for Source Languages Mapped to SIMT Hardware
+---------------------------------------------------------
+
+If the source language is mapped onto the AMDGPU wavefronts in a SIMT manner,
+then the variable DWARF location expressions must compute the location for a
+single lane of the wavefront. Therefore, a DWARF operation is required to denote
+the current lane, much like ``DW_OP_push_object_address`` denotes the current
+object.
+
+See ``DW_OP_LLVM_push_lane`` in :ref:`amdgpu-dwarf-base-type-entries`.
+
+.. _amdgpu-dwarf-support-for-divergent-control-flow-of-simt-hardware:
+
+2.13 Support for Divergent Control Flow of SIMT Hardware
+--------------------------------------------------------
+
+If the source language is mapped onto the AMDGPU wavefronts in a SIMT manner the
+compiler can use the AMDGPU execution mask register to control which lanes are
+active. To describe the conceptual location of non-active lanes requires an
+attribute that has an expression that computes the source location PC for each
+lane.
+
+For efficiency, the expression calculates the source location for the wavefront
+as a whole. This can be done using the ``DW_OP_LLVM_select_bit_piece`` (see
+:ref:`amdgpu-dwarf-operation-to-create-vector-composite-location-descriptions`)
+operation.
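+
+As a non-normative sketch of how these pieces could compose, the following
+expression describes a per-lane PC. The register names ``PC`` and ``EXEC``, the
+64-lane wavefront, and the base type ``uint64`` are illustrative assumptions
+rather than definitions made by these extensions::
+
+  DW_OP_LLVM_undefined                # inactive lanes have no defined PC
+  DW_OP_LLVM_extend 64, 64            # one 64-bit element per lane
+  DW_OP_regx PC                       # active lanes are at the wavefront PC
+  DW_OP_LLVM_extend 64, 64            # one 64-bit element per lane
+  DW_OP_regval_type EXEC, uint64      # the active lane mask value M
+  DW_OP_LLVM_select_bit_piece 64, 64  # lane i: 64 bits from the PC vector if
+                                      # bit i of M is set, else undefined
+
+The result is a single composite location description covering all 64 lanes.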
+ +The AMDGPU may update the execution mask to perform whole wavefront operations. +Therefore, there is a need for an attribute that computes the current active +lane mask. This can have an expression that may evaluate to the SIMT active lane +mask register or to a saved mask when in whole wavefront execution mode. + +An example that uses these attributes is referenced in the +:ref:`amdgpu-dwarf-examples` appendix. + +See ``DW_AT_LLVM_lane_pc`` and ``DW_AT_LLVM_active_lane`` in +:ref:`amdgpu-dwarf-composite-location-description-operations`. + +2.14 Define Source Language Address Classes +------------------------------------------- + +AMDGPU supports languages, such as OpenCL [:ref:`OpenCL `], +that define source language address classes. Support is added to define language +specific address classes so they can be used in a consistent way by consumers. + +It would also be desirable to add support for using address classes in defining +source language types. DWARF Version 5 only supports using target architecture +specific address spaces. + +See :ref:`amdgpu-dwarf-segment_addresses`. + +2.15 Define Augmentation Strings to Support Multiple Extensions +--------------------------------------------------------------- + +A ``DW_AT_LLVM_augmentation`` attribute is added to a compilation unit debugger +information entry to indicate that there is additional target architecture +specific information in the debugging information entries of that compilation +unit. This allows a consumer to know what extensions are present in the debugger +information entries as is possible with the augmentation string of other +sections. See . + +The format that should be used for an augmentation string is also recommended. +This allows a consumer to parse the string when it contains information from +multiple vendors. Augmentation strings occur in the ``DW_AT_LLVM_augmentation`` +attribute, in the lookup by name table, and in the CFI Common Information Entry +(CIE). -Since the AMDGPU supports languages such as OpenCL [:ref:`OpenCL -`], there is a need to define source language address -classes so they can be used in a consistent way by consumers. It would also be -desirable to add support for using them in defining language types rather than -the current target architecture specific address spaces. See -:ref:`amdgpu-dwarf-segment_addresses`. - -A ``DW_AT_LLVM_augmentation`` attribute is added to a compilation unit -debugger information entry to indicate that there is additional target -architecture specific information in the debugging information entries of that -compilation unit. This allows a consumer to know what extensions are present -in the debugger information entries as is possible with the augmentation -string of other sections. The format that should be used for the augmentation -string in the lookup by name table and CFI Common Information Entry is also -recommended to allow a consumer to parse the string when it contains -information from multiple vendors. - -The AMDGPU supports programming languages that include online compilation -where the source text may be created at runtime. Therefore, a way to embed the -source text in the debug information is required. For example, the OpenCL -language runtime supports online compilation. See -:ref:`amdgpu-dwarf-line-number-information`. - -Support to allow MD5 checksums to be optionally present in the line table is -added. This allows linking together compilation units where some have MD5 -checksums and some do not. 
In DWARF Version 5 the file timestamp and file size
-can be optional, but if the MD5 checksum is present it must be valid for all
-files. See :ref:`amdgpu-dwarf-line-number-information`.
-
-Support is added for the HIP programming language [:ref:`HIP
-`] which is supported by the AMDGPU. See
-:ref:`amdgpu-dwarf-language-names`.
-
-The following sections provide the definitions for the additional operations,
-as well as clarifying how existing expression operations, CFI operations, and
-attributes behave with respect to generalized location descriptions that
-support address spaces and location descriptions that support multiple places.
-It has been defined such that it is backwards compatible with DWARF Version 5.
-The definitions are intended to fully define well-formed DWARF in a consistent
-style based on the DWARF Version 5 specification. Non-normative text is shown
-in *italics*.
-
-The names for the new operations, attributes, and constants include "\
-``LLVM``\ " and are encoded with vendor specific codes so these extensions can
-be implemented as an LLVM vendor extension to DWARF Version 5. If accepted these
-names would not include the "\ ``LLVM``\ " and would not use encodings in the
-vendor range.
-
-The extensions are described in
-:ref:`amdgpu-dwarf-changes-relative-to-dwarf-version-5` and are
-organized to follow the section ordering of DWARF Version 5. It includes notes
-to indicate the corresponding DWARF Version 5 sections to which they pertain.
-Other notes describe additional changes that may be worth considering, and to
-raise questions.
+See :ref:`amdgpu-dwarf-full-and-partial-compilation-unit-entries`,
+:ref:`amdgpu-dwarf-name-index-section-header`, and
+:ref:`amdgpu-dwarf-structure_of-call-frame-information`.
+
+2.16 Support Embedding Source Text for Online Compilation
+---------------------------------------------------------
+
+AMDGPU supports programming languages that include online compilation where the
+source text may be created at runtime. For example, the OpenCL and HIP language
+runtimes support online compilation. To support this, a way to embed the source
+text in the debug information is provided.
+
+See :ref:`amdgpu-dwarf-line-number-information`.
+
+2.17 Allow MD5 Checksums to be Optionally Present
+-------------------------------------------------
+
+In DWARF Version 5 the file timestamp and file size can be optional, but if the
+MD5 checksum is present it must be valid for all files. This is a problem if
+using link time optimization to combine compilation units where some have MD5
+checksums and some do not. Therefore, support to allow MD5 checksums to be
+optionally present in the line table is added.
+
+See :ref:`amdgpu-dwarf-line-number-information`.
+
+2.18 Add the HIP Programming Language
+-------------------------------------
+
+The HIP programming language [:ref:`HIP `], which is supported
+by the AMDGPU, is added.
+
+See :ref:`amdgpu-dwarf-language-names-table`.
 
 .. _amdgpu-dwarf-changes-relative-to-dwarf-version-5:
 
-Changes Relative to DWARF Version 5
-===================================
+A. Changes Relative to DWARF Version 5
+======================================
+
+.. note::
+
+   This appendix provides changes relative to DWARF Version 5. It has been
+   defined such that it is backwards compatible with DWARF Version 5.
+   Non-normative text is shown in *italics*. The section numbers generally
+   correspond to those in the DWARF Version 5 standard unless specified
+   otherwise. Definitions are given for the additional operations, as well as
+   clarifying how existing expression operations, CFI operations, and attributes
+   behave with respect to generalized location descriptions that support address
+   spaces and multiple places.
+
+   The names for the new operations, attributes, and constants include "\
+   ``LLVM``\ " and are encoded with vendor specific codes so these extensions can
+   be implemented as an LLVM vendor extension to DWARF Version 5.
+
+   .. note::
+
+     Notes are included to describe how the changes are to be applied to the
+     DWARF Version 5 standard. They also describe rationale and issues that may
+     need further consideration.
 
-General Description
--------------------
+A.2 General Description
+-----------------------
 
-Attribute Types
-~~~~~~~~~~~~~~~
+A.2.2 Attribute Types
+~~~~~~~~~~~~~~~~~~~~~
 
 .. note::
 
   This augments DWARF Version 5 section 2.2 and Table 2.2.
 
-The following table provides the additional attributes. See
-:ref:`amdgpu-dwarf-debugging-information-entry-attributes`.
+The following table provides the additional attributes.
 
 .. table:: Attribute names
    :name: amdgpu-dwarf-attribute-names-table
 
@@ -441,17 +599,17 @@
 
    =========================== ====================================
    Attribute                   Usage
   =========================== ====================================
-   ``DW_AT_LLVM_active_lane``  SIMD or SIMT active lanes
-   ``DW_AT_LLVM_augmentation`` Compilation unit augmentation string
-   ``DW_AT_LLVM_lane_pc``      SIMD or SIMT lane program location
-   ``DW_AT_LLVM_lanes``        SIMD or SIMT thread lane count
-   ``DW_AT_LLVM_vector_size``  Base type vector size
+   ``DW_AT_LLVM_active_lane``  SIMD or SIMT active lanes (see :ref:`amdgpu-dwarf-low-level-information`)
+   ``DW_AT_LLVM_augmentation`` Compilation unit augmentation string (see :ref:`amdgpu-dwarf-full-and-partial-compilation-unit-entries`)
+   ``DW_AT_LLVM_lane_pc``      SIMD or SIMT lane program location (see :ref:`amdgpu-dwarf-low-level-information`)
+   ``DW_AT_LLVM_lanes``        SIMD or SIMT thread lane count (see :ref:`amdgpu-dwarf-low-level-information`)
+   ``DW_AT_LLVM_vector_size``  Base type vector size (see :ref:`amdgpu-dwarf-base-type-entries`)
   =========================== ====================================
 
 .. _amdgpu-dwarf-expressions:
 
-DWARF Expressions
-~~~~~~~~~~~~~~~~~
+A.2.5 DWARF Expressions
+~~~~~~~~~~~~~~~~~~~~~~~
 
 .. note::
 
@@ -506,8 +664,8 @@
 
 .. _amdgpu-dwarf-expression-evaluation-context:
 
-DWARF Expression Evaluation Context
-+++++++++++++++++++++++++++++++++++
+A.2.5.1 DWARF Expression Evaluation Context
++++++++++++++++++++++++++++++++++++++++++++
 
 A DWARF expression is evaluated in a context that can include a number of
 context elements. If multiple context elements are specified then they must be
 
@@ -526,9 +684,9 @@
 
   It is required for operations that are related to target architecture
   threads.
 
-  *For example, the* ``DW_OP_form_tls_address`` *operation and*
-  ``DW_OP_LLVM_form_aspace_address`` *operation when given an address space that
-  is thread specific.*
+  *For example, the* ``DW_OP_regval_type`` *operation, or the*
+  ``DW_OP_form_tls_address`` *and* ``DW_OP_LLVM_form_aspace_address``
+  *operations when given an address space that is thread specific.*
 
 *A current lane*
 
@@ -618,10 +776,10 @@
 
   *Note that this compilation unit may not be the same as the compilation unit
   determined from the loaded code object corresponding to the current program
-  location.
For example, the evaluation of the expression E associated with a - ``DW_AT_location`` attribute of the debug information entry operand of the - ``DW_OP_call*`` operations is evaluated with the compilation unit that - contains E and not the one that contains the ``DW_OP_call*`` operation + location. For example, the evaluation of the expression E associated with a* + ``DW_AT_location`` *attribute of the debug information entry operand of the* + ``DW_OP_call*`` *operations is evaluated with the compilation unit that + contains E and not the one that contains the* ``DW_OP_call*`` *operation expression.* *A current target architecture* @@ -641,7 +799,7 @@ must be the same as the target architecture of the current thread. * If the current compilation unit is specified, then the current target - architecture default address space address size must be the same as he + architecture default address space address size must be the same as the ``address_size`` field in the header of the current compilation unit and any associated entry in the ``.debug_aranges`` section. @@ -651,7 +809,7 @@ corresponding to the current program location. * If the current program location is specified, then the current target - architecture default address space address size must be the same as he + architecture default address space address size must be the same as the ``address_size`` field in the header of any entry corresponding to the current program location in the ``.debug_addr``, ``.debug_line``, ``.debug_rnglists``, ``.debug_rnglists.dwo``, ``.debug_loclists``, and @@ -666,9 +824,8 @@ It is required for the ``DW_OP_push_object_address`` operation. *For example, the* ``DW_AT_data_location`` *attribute on type debug - information entries specifies the the program object corresponding to a - runtime descriptor as the current object when it evaluates its associated - expression.* + information entries specifies the program object corresponding to a runtime + descriptor as the current object when it evaluates its associated expression.* The result is undefined if the location descriptor is invalid (see :ref:`amdgpu-dwarf-location-description`). @@ -689,7 +846,7 @@ If the evaluation requires a context element that is not specified, then the result of the evaluation is an error. -*A DWARF expression for the location description may be able to be evaluated +*A DWARF expression for a location description may be able to be evaluated without a thread, lane, call frame, program location, or architecture context. For example, the location of a global variable may be able to be evaluated without such context. If the expression evaluates with an error then it may @@ -707,8 +864,8 @@ .. _amdgpu-dwarf-expression-value: -DWARF Expression Value -++++++++++++++++++++++ +A.2.5.2 DWARF Expression Value +++++++++++++++++++++++++++++++ A value has a type and a literal value. It can represent a literal value of any supported base type of the target architecture. The base type specifies the @@ -744,8 +901,8 @@ .. _amdgpu-dwarf-location-description: -DWARF Location Description -++++++++++++++++++++++++++ +A.2.5.3 DWARF Location Description +++++++++++++++++++++++++++++++++++ *Debugging information must provide consumers a way to find the location of program variables, determine the bounds of dynamic arrays and strings, and @@ -799,16 +956,19 @@ provided by the operations. *Location descriptions are a language independent representation of addressing -rules. They are created using DWARF operation expressions of arbitrary -complexity. 
They can be the result of evaluating a debugger information entry -attribute that specifies an operation expression. In this usage they can -describe the location of an object as long as its lifetime is either static or -the same as the lexical block (see DWARF Version 5 section 3.5) that owns it, -and it does not move during its lifetime. They can be the result of evaluating a -debugger information entry attribute that specifies a location list expression. -In this usage they can describe the location of an object that has a limited -lifetime, changes its location during its lifetime, or has multiple locations -over part or all of its lifetime.* +rules.* + +* *They can be the result of evaluating a debugger information entry attribute + that specifies an operation expression of arbitrary complexity. In this usage + they can describe the location of an object as long as its lifetime is either + static or the same as the lexical block (see + :ref:`amdgpu-dwarf-lexical-block-entries`) that owns it, and it does not move + during its lifetime.* + +* *They can be the result of evaluating a debugger information entry attribute + that specifies a location list expression. In this usage they can describe the + location of an object that has a limited lifetime, changes its location during + its lifetime, or has multiple locations over part or all of its lifetime.* If a location description has more than one single location description, the DWARF expression is ill-formed if the object value held in each single location @@ -884,8 +1044,8 @@ .. _amdgpu-dwarf-operation-expressions: -DWARF Operation Expressions -+++++++++++++++++++++++++++ +A.2.5.4 DWARF Operation Expressions ++++++++++++++++++++++++++++++++++++ An operation expression is comprised of a stream of operations, each consisting of an opcode followed by zero or more operands. The number of operands is @@ -963,7 +1123,7 @@ specifies the byte count. It can be used: * as the value of a debugging information entry attribute that is encoded using - class ``exprloc`` (see DWARF Version 5 section 7.5.5), + class ``exprloc`` (see :ref:`amdgpu-dwarf-classes-and-forms`), * as the operand to certain operation expression operations, @@ -975,8 +1135,12 @@ .. _amdgpu-dwarf-stack-operations: -Stack Operations -################ +A.2.5.4.1 Stack Operations +########################## + +.. note:: + + This section replaces DWARF Version 5 section 2.5.1.3. The following operations manipulate the DWARF stack. Operations that index the stack assume that the top of the stack (most recently added entry) has index 0. @@ -1018,7 +1182,7 @@ ``DW_OP_over`` pushes a copy of the entry with index 1. - *This is equivalent to a ``DW_OP_pick 1`` operation.* + *This is equivalent to a* ``DW_OP_pick 1`` *operation.* 5. ``DW_OP_swap`` @@ -1034,8 +1198,12 @@ .. _amdgpu-dwarf-control-flow-operations: -Control Flow Operations -####################### +A.2.5.4.2 Control Flow Operations +################################# + +.. note:: + + This section replaces DWARF Version 5 section 2.5.1.5. The following operations provide simple control of the flow of a DWARF operation expression. @@ -1097,7 +1265,7 @@ relative to the beginning of the ``.debug_info`` section that contains the current compilation unit. D may not be in the current compilation unit. - .. note: + .. note:: DWARF Version 5 states that DR can be an offset in a ``.debug_info`` section other than the one that contains the current compilation unit. 
It @@ -1176,14 +1344,14 @@ entry is to push just one location description on the stack. That location description may have more than one single location description. - The previous rule for ``exprloc`` also has the same problem as normally + The previous rule for ``exprloc`` also has the same problem, as normally a variable or formal parameter location expression may leave multiple entries on the stack and only return the top entry. GDB implements ``DW_OP_call*`` by always executing E on the same stack. If the location list has multiple matching entries, it simply picks the first one and ignores the rest. This seems fundamentally at odds with - the desire to supporting multiple places for variables. + the desire to support multiple places for variables. So, it feels like ``DW_OP_call*`` should both support pushing a location description on the stack for a variable or formal parameter, and also @@ -1234,8 +1402,8 @@ *This allows a call operation to be used to compute the location description for any variable or formal parameter regardless of whether the - producer has optimized it to a constant. This is consistent with the - ``DW_OP_implicit_pointer`` operation.* + producer has optimized it to a constant. This is consistent with the* + ``DW_OP_implicit_pointer`` *operation.* .. note:: @@ -1264,12 +1432,12 @@ .. _amdgpu-dwarf-value-operations: -Value Operations -################ +A.2.5.4.3 Value Operations +########################## This section describes the operations that push values on the stack. -Each value stack entry has a type and a literal value and can represent a +Each value stack entry has a type and a literal value. It can represent a literal value of any supported base type of the target architecture. The base type specifies the size, encoding, and endianity of the literal value. @@ -1277,8 +1445,12 @@ .. _amdgpu-dwarf-literal-operations: -Literal Operations -^^^^^^^^^^^^^^^^^^ +A.2.5.4.3.1 Literal Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This section replaces DWARF Version 5 section 2.5.1.1. The following operations all push a literal value onto the DWARF stack. @@ -1325,7 +1497,7 @@ link-time relocation but should not be interpreted by the consumer as a relocatable address (for example, offsets to thread-local storage).* -9. ``DW_OP_const_type`` +7. ``DW_OP_const_type`` ``DW_OP_const_type`` has three operands. The first is an unsigned LEB128 integer DR that represents the byte offset of a debugging information entry @@ -1346,7 +1518,7 @@ operation can be parsed easily without reference to the* ``.debug_info`` *section.* -10. ``DW_OP_LLVM_push_lane`` *New* +8. ``DW_OP_LLVM_push_lane`` *New* ``DW_OP_LLVM_push_lane`` pushes the target architecture lane identifier of the current lane as a value with the generic type. @@ -1357,8 +1529,8 @@ .. _amdgpu-dwarf-arithmetic-logical-operations: -Arithmetic and Logical Operations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A.2.5.4.3.2 Arithmetic and Logical Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. note:: @@ -1366,8 +1538,8 @@ .. _amdgpu-dwarf-type-conversions-operations: -Type Conversion Operations -^^^^^^^^^^^^^^^^^^^^^^^^^^ +A.2.5.4.3.3 Type Conversion Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. note:: @@ -1375,8 +1547,13 @@ .. _amdgpu-dwarf-general-operations: -Special Value Operations -^^^^^^^^^^^^^^^^^^^^^^^^ +A.2.5.4.3.4 Special Value Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This section replaces parts of DWARF Version 5 sections 2.5.1.2, 2.5.1.3, and + 2.5.1.7. 
There are these special value operations currently defined: @@ -1511,8 +1688,8 @@ undefined location storage or the offset of any bit exceeds the size of the location storage LS specified by any single location description SL of L. - See :ref:`amdgpu-dwarf-implicit-location-descriptions` for special rules - concerning implicit location descriptions created by the + See :ref:`amdgpu-dwarf-implicit-location-description-operations` for special + rules concerning implicit location descriptions created by the ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_implicit_aspace_pointer`` operations. @@ -1559,8 +1736,8 @@ represents a target architecture specific address space identifier AS. The operation is equivalent to performing ``DW_OP_swap; - DW_OP_LLVM_form_aspace_address; DW_OP_deref_type S R``. The value V - retrieved is left on the stack with the type D. + DW_OP_LLVM_form_aspace_address; DW_OP_deref_type S DR``. The value V + retrieved is left on the stack with the type T. *This operation is deprecated as the* ``DW_OP_LLVM_form_aspace_address`` *operation can be used and provides greater expressiveness.* @@ -1585,17 +1762,17 @@ frame information (see :ref:`amdgpu-dwarf-call-frame-information`). If the result of E is a location description L (see - :ref:`amdgpu-dwarf-register-location-descriptions`), and the last operation - executed by E is a ``DW_OP_reg*`` for register R with a target architecture - specific base type of T, then the contents of the register are retrieved as - if a ``DW_OP_deref_type DR`` operation was performed where DR is the offset - of a hypothetical debug information entry in the current compilation unit - for T. The resulting value V s pushed on the stack. + :ref:`amdgpu-dwarf-register-location-description-operations`), and the last + operation executed by E is a ``DW_OP_reg*`` for register R with a target + architecture specific base type of T, then the contents of the register are + retrieved as if a ``DW_OP_deref_type DR`` operation was performed where DR + is the offset of a hypothetical debug information entry in the current + compilation unit for T. The resulting value V is pushed on the stack. *Using* ``DW_OP_reg*`` *provides a more compact form for the case where the value was in a register on entry to the subprogram.* - .. note: + .. note:: It is unclear how this provides a more compact expression, as ``DW_OP_regval_type`` could be used which is marginally larger. @@ -1621,14 +1798,20 @@ .. _amdgpu-dwarf-location-description-operations: -Location Description Operations -############################### +A.2.5.4.4 Location Description Operations +######################################### This section describes the operations that push location descriptions on the stack. -General Location Description Operations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. _amdgpu-dwarf-general-location-description-operations: + +A.2.5.4.4.1 General Location Description Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This section replaces part of DWARF Version 5 section 2.5.1.3. 1.
``DW_OP_LLVM_offset`` *New* @@ -1687,15 +1870,33 @@ expression evaluation.* *This operation provides explicit functionality (especially for arrays - involving descriptions) that is analogous to the implicit push of the base - location description of a structure prior to evaluation of a - ``DW_AT_data_member_location`` to access a data member of a structure.* + involving descriptors) that is analogous to the implicit push of the base + location description of a structure prior to evaluation of a* + ``DW_AT_data_member_location`` *to access a data member of a structure.* .. note:: This operation could be removed and the object location description specified as the initial stack as for ``DW_AT_data_member_location``. + Or this operation could be used instead of needing to specify an initial + stack. The latter approach is more composable as access to the object may + be needed at any point of the expression, and passing it as the initial + stack requires the entire expression to be aware of where on the stack it is. + If this were done, ``DW_AT_use_location`` would require a + ``DW_OP_push_object2_address`` operation for the second object. + + Or a more general way to pass an arbitrary number of arguments in, and an + operation to get the Nth one such as ``DW_OP_arg N``, could be added. A vector of + arguments would then be passed in the expression context rather than an + initial stack. This could also resolve the issues with ``DW_OP_call*`` by + allowing a specific number of arguments passed in and returned to be + specified. The ``DW_OP_call*`` operation could then always execute on a + separate stack: the number of arguments would be specified in a new call + operation and taken from the caller's stack, and similarly the number of + return results specified and copied from the called stack back to the + caller's stack when the called expression was complete. + The only attribute that specifies a current object is ``DW_AT_data_location`` so the non-normative text seems to overstate how this is being used. Or are there other attributes that need to state they @@ -1717,8 +1918,12 @@ .. _amdgpu-dwarf-undefined-location-description-operations: -Undefined Location Description Operations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A.2.5.4.4.2 Undefined Location Description Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This section replaces DWARF Version 5 section 2.6.1.1.1. *The undefined location storage represents a piece or all of an object that is present in the source but not in the object code (perhaps due to optimization). @@ -1739,8 +1944,13 @@ .. _amdgpu-dwarf-memory-location-description-operations: -Memory Location Description Operations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A.2.5.4.4.3 Memory Location Description Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This section replaces parts of DWARF Version 5 section 2.5.1.1, 2.5.1.2, + 2.5.1.3, and 2.6.1.1.2. Each of the target architecture specific address spaces has a corresponding memory location storage that denotes the linear addressable memory of that @@ -1796,10 +2006,9 @@ description L with a one memory location description SL. If the type size of V is less than the generic type size, then the value V is zero extended to the size of the generic type. The least significant generic type size bits - are treated as a twos-complement unsigned value to be used as an address A.
- SL specifies memory location storage corresponding to the target - architecture default address space with a bit offset equal to A scaled by 8 - (the byte size). + are treated as an unsigned value to be used as an address A. SL specifies + memory location storage corresponding to the target architecture default + address space with a bit offset equal to A scaled by 8 (the byte size). The implicit conversion could also be defined as target architecture specific. For example, GDB checks if V is an integral type. If it is not it gives an @@ -1812,7 +2021,7 @@ pointer value IPV with the target architecture default address space, then it is implicitly converted to a location description with one single location description specified by IPV. See -:ref:`amdgpu-dwarf-implicit-location-descriptions`. +:ref:`amdgpu-dwarf-implicit-location-description-operations`. .. note:: @@ -1869,8 +2078,8 @@ The address size S is defined as the address bit size of the target architecture specific address space that corresponds to AS. - A is adjusted to S bits by zero extending if necessary, and then treating the - least significant S bits as a twos-complement unsigned value A'. + A is adjusted to S bits by zero extending if necessary, and then treating + the least significant S bits as an unsigned value A'. It pushes a location description L with one memory location description SL on the stack. SL specifies the memory location storage LS that corresponds @@ -1890,8 +2099,8 @@ The DWARF expression is ill-formed if AS is not one of the values defined by the target architecture specific ``DW_ASPACE_*`` values. - See :ref:`amdgpu-dwarf-implicit-location-descriptions` for special rules - concerning implicit pointer values produced by dereferencing implicit + See :ref:`amdgpu-dwarf-implicit-location-description-operations` for special + rules concerning implicit pointer values produced by dereferencing implicit location descriptions created by the ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_implicit_aspace_pointer`` operations. @@ -1950,7 +2159,7 @@ The location description L for the *frame base* of the current subprogram is obtained from the ``DW_AT_frame_base`` attribute of the debugger information entry corresponding to the current subprogram as described in - :ref:`amdgpu-dwarf-debugging-information-entry-attributes`. + :ref:`amdgpu-dwarf-low-level-information`. The location description L is updated as if the ``DW_OP_LLVM_offset_uconst B`` operation was applied. The updated L is pushed on the stack. @@ -2010,10 +2219,14 @@ Could also consider adding ``DW_OP_aspace_breg0, DW_OP_aspace_breg1, ..., DW_OP_aspace_bref31`` which would save encoding size. -.. _amdgpu-dwarf-register-location-descriptions: +.. _amdgpu-dwarf-register-location-description-operations: + +A.2.5.4.4.4 Register Location Description Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: -Register Location Description Operations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + This section replaces DWARF Version 5 section 2.6.1.1.3. There is a register location storage that corresponds to each of the target architecture registers. The size of each register location storage corresponds @@ -2062,10 +2275,14 @@ ``DW_OP_breg*`` *register-based addressing operations, or use* ``DW_OP_deref*`` *on a register location description.* -.. _amdgpu-dwarf-implicit-location-descriptions: +.. 
_amdgpu-dwarf-implicit-location-description-operations: -Implicit Location Description Operations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A.2.5.4.4.5 Implicit Location Description Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This section replaces DWARF Version 5 section 2.6.1.1.4. Implicit location storage represents a piece or all of an object which has no actual location in the program but whose contents are nonetheless known, either @@ -2103,8 +2320,8 @@ location description specifies the actual value of the object, rather than specifying the memory or register storage that holds the value.* - See :ref:`amdgpu-dwarf-implicit-location-descriptions` for special rules - concerning implicit pointer values produced by dereferencing implicit + See :ref:`amdgpu-dwarf-implicit-location-description-operations` for special + rules concerning implicit pointer values produced by dereferencing implicit location descriptions created by the ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_implicit_aspace_pointer`` operations. @@ -2218,7 +2435,7 @@ *The restrictions on how an implicit pointer location description created by* ``DW_OP_implicit_pointer`` *and* ``DW_OP_LLVM_aspace_implicit_pointer`` *can be used are to simplify the DWARF consumer. Similarly, for an implicit - pointer value created by* ``DW_OP_deref*`` *and* ``DW_OP_stack_value``\ .* + pointer value created by* ``DW_OP_deref*`` *and* ``DW_OP_stack_value``\ *.* 4. ``DW_OP_LLVM_aspace_implicit_pointer`` *New* @@ -2259,13 +2476,17 @@ ``DW_AT_location`` *or* ``DW_AT_const_value`` *attribute (for example,* ``DW_TAG_dwarf_procedure``\ *). By using E*\ :sub:`2`\ *, a consumer can reconstruct the value of the object when asked to dereference the pointer -described by E*\ :sub:`1` *which contains the* ``DW_OP_implicit_pointer`` or +described by E*\ :sub:`1` *which contains the* ``DW_OP_implicit_pointer`` *or* ``DW_OP_LLVM_aspace_implicit_pointer`` *operation.* .. _amdgpu-dwarf-composite-location-description-operations: -Composite Location Description Operations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A.2.5.4.4.6 Composite Location Description Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This section replaces DWARF Version 5 section 2.6.1.2. A composite location storage represents an object or value which may be contained in part of another location storage or contained in parts of more @@ -2480,8 +2701,12 @@ .. _amdgpu-dwarf-location-list-expressions: -DWARF Location List Expressions -+++++++++++++++++++++++++++++++ +A.2.5.5 DWARF Location List Expressions ++++++++++++++++++++++++++++++++++++++++ + +.. note:: + + This section replaces DWARF Version 5 section 2.6.2. *To meet the needs of recent computer architectures and optimization techniques, debugging information must be able to describe the location of an object whose @@ -2573,10 +2798,10 @@ A location list expression can only be used as the value of a debugger information entry attribute that is encoded using class ``loclist`` or -``loclistsptr`` (see DWARF Version 5 section 7.5.5). The value of the attribute -provides an index into a separate object file section called ``.debug_loclists`` -or ``.debug_loclists.dwo`` (for split DWARF object files) that contains the -location list entries. +``loclistsptr`` (see :ref:`amdgpu-dwarf-classes-and-forms`). 
The value of the +attribute provides an index into a separate object file section called +``.debug_loclists`` or ``.debug_loclists.dwo`` (for split DWARF object files) +that contains the location list entries. A ``DW_OP_call*`` and ``DW_OP_implicit_pointer`` operation can be used to specify a debugger information entry attribute that has a location list @@ -2596,8 +2821,8 @@ .. _amdgpu-dwarf-segment_addresses: -Segmented Addresses -~~~~~~~~~~~~~~~~~~~ +A.2.12 Segmented Addresses +~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: @@ -2798,69 +3023,106 @@ operations may be needed. The legal casts between address classes may need to be defined on a per language address class basis. -.. _amdgpu-dwarf-debugging-information-entry-attributes: - -Debugging Information Entry Attributes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A.3 Program Scope Entries +------------------------- .. note:: This section provides changes to existing debugger information entry - attributes and defines attributes added by these extensions. These would be - incorporated into the appropriate DWARF Version 5 chapter 2 sections. + attributes. These would be incorporated into the corresponding DWARF Version 5 + chapter 3 sections. -1. ``DW_AT_location`` +A.3.1 Unit Entries +~~~~~~~~~~~~~~~~~~ - Any debugging information entry describing a data object (which includes - variables and parameters) or common blocks may have a ``DW_AT_location`` - attribute, whose value is a DWARF expression E. +.. _amdgpu-dwarf-full-and-partial-compilation-unit-entries: - The result of the attribute is obtained by evaluating E with a context that - has a result kind of a location description, an unspecified object, the - compilation unit that contains E, an empty initial stack, and other context - elements corresponding to the source language thread of execution upon which - the user is focused, if any. The result of the evaluation is the location - description of the base of the data object. +A.3.1.1 Full and Partial Compilation Unit Entries ++++++++++++++++++++++++++++++++++++++++++++++++++ - See :ref:`amdgpu-dwarf-control-flow-operations` for special evaluation rules - used by the ``DW_OP_call*`` operations. +.. note:: - .. note:: + This augments DWARF Version 5 section 3.1.1 and Table 3.1. - Delete the description of how the ``DW_OP_call*`` operations evaluate a - ``DW_AT_location`` attribute as that is now described in the operations. +Additional language codes defined for use with the ``DW_AT_language`` attribute +are defined in :ref:`amdgpu-dwarf-language-names-table`. - .. note:: +.. table:: Language Names + :name: amdgpu-dwarf-language-names-table - See the discussion about the ``DW_AT_location`` attribute in the - ``DW_OP_call*`` operation. Having each attribute only have a single - purpose and single execution semantics seems desirable. It makes it easier - for the consumer that no longer have to track the context. It makes it - easier for the producer as it can rely on a single semantics for each - attribute. + ==================== ============================= + Language Name Meaning + ==================== ============================= + ``DW_LANG_LLVM_HIP`` HIP Language. + ==================== ============================= - For that reason, limiting the ``DW_AT_location`` attribute to only - supporting evaluating the location description of an object, and using a - different attribute and encoding class for the evaluation of DWARF - expression *procedures* on the same operation expression stack seems - desirable. 
+The HIP language [:ref:`HIP `] can be supported by extending +the C++ language. -2. ``DW_AT_const_value`` +.. note:: - .. note:: + The following new attribute is added. - Could deprecate using the ``DW_AT_const_value`` attribute for - ``DW_TAG_variable`` or ``DW_TAG_formal_parameter`` debugger information - entries that have been optimized to a constant. Instead, - ``DW_AT_location`` could be used with a DWARF expression that produces an - implicit location description now that any location description can be - used within a DWARF expression. This allows the ``DW_OP_call*`` operations - to be used to push the location description of any variable regardless of - how it is optimized. +1. A ``DW_TAG_compile_unit`` debugger information entry for a compilation unit + may have a ``DW_AT_LLVM_augmentation`` attribute, whose value is an + augmentation string. -3. ``DW_AT_frame_base`` + *The augmentation string allows producers to indicate that there is + additional vendor or target specific information in the debugging + information entries. For example, this might be information about the + version of vendor specific extensions that are being used.* - A ``DW_TAG_subprogram`` or ``DW_TAG_entry_point`` debugger information entry + If not present, or if the string is empty, then the compilation unit has no + augmentation string. + + The format for the augmentation string is: + + | ``[``\ *vendor*\ ``:v``\ *X*\ ``.``\ *Y*\ [\ ``:``\ *options*\ ]\ ``]``\ * + + Where *vendor* is the producer, ``vX.Y`` specifies the major X and minor Y + version number of the extensions used, and *options* is an optional string + providing additional information about the extensions. The version number + must conform to semantic versioning [:ref:`SEMVER `]. + The *options* string must not contain the "\ ``]``\ " character. + + For example: + + :: + + [abc:v0.0][def:v1.2:feature-a=on,feature-b=3] + +A.3.3 Subroutine and Entry Point Entries +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _amdgpu-dwarf-low-level-information: + +A.3.3.5 Low-Level Information ++++++++++++++++++++++++++++++ + +1. A ``DW_TAG_subprogram``, ``DW_TAG_inlined_subroutine``, or + ``DW_TAG_entry_point`` debugger information entry may have a + ``DW_AT_return_addr`` attribute, whose value is a DWARF expression E. + + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an empty initial stack, and other context + elements corresponding to the source language thread of execution upon which + the user is focused, if any. The result of the evaluation is the location + description L of the place where the return address for the current call + frame's subprogram or entry point is stored. + + The DWARF is ill-formed if L is not comprised of one memory location + description for one of the target architecture specific address spaces. + + .. note:: + + It is unclear why ``DW_TAG_inlined_subroutine`` has a + ``DW_AT_return_addr`` attribute but not a ``DW_AT_frame_base`` or + ``DW_AT_static_link`` attribute. Seems it would either have all of them or + none. Since inlined subprograms do not have a call frame it seems they + would have none of these attributes. + +2. A ``DW_TAG_subprogram`` or ``DW_TAG_entry_point`` debugger information entry may have a ``DW_AT_frame_base`` attribute, whose value is a DWARF expression E. @@ -2874,12 +3136,12 @@ resulting location description L is not comprised of one single location description SL. 
- If SL a register location description for register R, then L is replaced + If SL is a register location description for register R, then L is replaced with the result of evaluating a ``DW_OP_bregx R, 0`` operation. This computes the frame base memory location description in the target architecture default address space. - *This allows the more compact* ``DW_OPreg*`` *to be used instead of* + *This allows the more compact* ``DW_OP_reg*`` *to be used instead of* ``DW_OP_breg* 0``\ *.* .. note:: @@ -2897,120 +3159,7 @@ *Typically, E will use the* ``DW_OP_call_frame_cfa`` *operation or be a stack pointer register plus or minus some offset.* -4. ``DW_AT_data_member_location`` - - For a ``DW_AT_data_member_location`` attribute there are two cases: - - 1. If the attribute is an integer constant B, it provides the offset in - bytes from the beginning of the containing entity. - - The result of the attribute is obtained by evaluating a - ``DW_OP_LLVM_offset B`` operation with an initial stack comprising the - location description of the beginning of the containing entity. The - result of the evaluation is the location description of the base of the - member entry. - - *If the beginning of the containing entity is not byte aligned, then the - beginning of the member entry has the same bit displacement within a - byte.* - - 2. Otherwise, the attribute must be a DWARF expression E which is evaluated - with a context that has a result kind of a location description, an - unspecified object, the compilation unit that contains E, an initial - stack comprising the location description of the beginning of the - containing entity, and other context elements corresponding to the - source language thread of execution upon which the user is focused, if - any. The result of the evaluation is the location description of the - base of the member entry. - - .. note:: - - The beginning of the containing entity can now be any location - description, including those with more than one single location - description, and those with single location descriptions that are of any - kind and have any bit offset. - -5. ``DW_AT_use_location`` - - The ``DW_TAG_ptr_to_member_type`` debugging information entry has a - ``DW_AT_use_location`` attribute whose value is a DWARF expression E. It is - used to compute the location description of the member of the class to which - the pointer to member entry points. - - *The method used to find the location description of a given member of a - class, structure, or union is common to any instance of that class, - structure, or union and to any instance of the pointer to member type. The - method is thus associated with the pointer to member type, rather than with - each object that has a pointer to member type.* - - The ``DW_AT_use_location`` DWARF expression is used in conjunction with the - location description for a particular object of the given pointer to member - type and for a particular structure or class instance. - - The result of the attribute is obtained by evaluating E with a context that - has a result kind of a location description, an unspecified object, the - compilation unit that contains E, an initial stack comprising two entries, - and other context elements corresponding to the source language thread of - execution upon which the user is focused, if any. The first stack entry is - the value of the pointer to member object itself. 
The second stack entry is - the location description of the base of the entire class, structure, or - union instance containing the member whose location is being calculated. The - result of the evaluation is the location description of the member of the - class to which the pointer to member entry points. - -6. ``DW_AT_data_location`` - - The ``DW_AT_data_location`` attribute may be used with any type that - provides one or more levels of hidden indirection and/or run-time parameters - in its representation. Its value is a DWARF operation expression E which - computes the location description of the data for an object. When this - attribute is omitted, the location description of the data is the same as - the location description of the object. - - The result of the attribute is obtained by evaluating E with a context that - has a result kind of a location description, an object that is the location - description of the data descriptor, the compilation unit that contains E, an - empty initial stack, and other context elements corresponding to the source - language thread of execution upon which the user is focused, if any. The - result of the evaluation is the location description of the base of the - member entry. - - *E will typically involve an operation expression that begins with a* - ``DW_OP_push_object_address`` *operation which loads the location - description of the object which can then serve as a description in - subsequent calculation.* - - .. note:: - - Since ``DW_AT_data_member_location``, ``DW_AT_use_location``, and - ``DW_AT_vtable_elem_location`` allow both operation expressions and - location list expressions, why does ``DW_AT_data_location`` not allow - both? In all cases they apply to data objects so less likely that - optimization would cause different operation expressions for different - program location ranges. But if supporting for some then should be for - all. - - It seems odd this attribute is not the same as - ``DW_AT_data_member_location`` in having an initial stack with the - location description of the object since the expression has to need it. - -7. ``DW_AT_vtable_elem_location`` - - An entry for a virtual function also has a ``DW_AT_vtable_elem_location`` - attribute whose value is a DWARF expression E. - - The result of the attribute is obtained by evaluating E with a context that - has a result kind of a location description, an unspecified object, the - compilation unit that contains E, an initial stack comprising the location - description of the object of the enclosing type, and other context elements - corresponding to the source language thread of execution upon which the user - is focused, if any. The result of the evaluation is the location description - of the slot for the function within the virtual function table for the - enclosing class. - -8. ``DW_AT_static_link`` - - If a ``DW_TAG_subprogram`` or ``DW_TAG_entry_point`` debugger information +3. If a ``DW_TAG_subprogram`` or ``DW_TAG_entry_point`` debugger information entry is lexically nested, it may have a ``DW_AT_static_link`` attribute, whose value is a DWARF expression E. @@ -3027,35 +3176,86 @@ The DWARF is ill-formed if L is is not comprised of one memory location description for one of the target architecture specific address spaces. -9. ``DW_AT_return_addr`` + .. note:: + + The following new attributes are added. - A ``DW_TAG_subprogram``, ``DW_TAG_inlined_subroutine``, or +4. 
For languages that are implemented using a SIMD or SIMT execution model, a + ``DW_TAG_subprogram``, ``DW_TAG_inlined_subroutine``, or ``DW_TAG_entry_point`` debugger information entry may have a - ``DW_AT_return_addr`` attribute, whose value is a DWARF expression E. + ``DW_AT_LLVM_lanes`` attribute whose value is an integer constant that is + the number of lanes per thread. This is the static number of lanes per + thread. It is not the dynamic number of lanes with which the thread was + initiated, for example, due to smaller or partial work-groups. + + If not present, the default value of 1 is used. + + The DWARF is ill-formed if the value is 0. + +5. For languages that are implemented using a SIMD or SIMT execution model, a + ``DW_TAG_subprogram``, ``DW_TAG_inlined_subroutine``, or + ``DW_TAG_entry_point`` debugging information entry may have a + ``DW_AT_LLVM_lane_pc`` attribute whose value is a DWARF expression E. The result of the attribute is obtained by evaluating E with a context that has a result kind of a location description, an unspecified object, the compilation unit that contains E, an empty initial stack, and other context elements corresponding to the source language thread of execution upon which - the user is focused, if any. The result of the evaluation is the location - description L of the place where the return address for the current call - frame's subprogram or entry point is stored. + the user is focused, if any. - The DWARF is ill-formed if L is not comprised of one memory location - description for one of the target architecture specific address spaces. + The resulting location description L is for a thread lane count sized vector + of generic type elements. The thread lane count is the value of the + ``DW_AT_LLVM_lanes`` attribute. Each element holds the conceptual program + location of the corresponding lane, where the least significant element + corresponds to the first target architecture specific lane identifier and so + forth. If the lane was not active when the current subprogram was called, + its element is an undefined location description. - .. note:: + ``DW_AT_LLVM_lane_pc`` *allows the compiler to indicate conceptually where + each lane of a SIMT thread is positioned even when it is in divergent + control flow that is not active.* - It is unclear why ``DW_TAG_inlined_subroutine`` has a - ``DW_AT_return_addr`` attribute but not a ``DW_AT_frame_base`` or - ``DW_AT_static_link`` attribute. Seems it would either have all of them or - none. Since inlined subprograms do not have a call frame it seems they - would have none of these attributes. + *Typically, the result is a location description with one composite location + description with each part being a location description with either one + undefined location description or one memory location description.* + + If not present, the thread is not being used in a SIMT manner, and the + thread's current program location is used. + +6. For languages that are implemented using a SIMD or SIMT execution model, a + ``DW_TAG_subprogram``, ``DW_TAG_inlined_subroutine``, or + ``DW_TAG_entry_point`` debugger information entry may have a + ``DW_AT_LLVM_active_lane`` attribute whose value is a DWARF expression E. 
+ + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a value, an unspecified object, the compilation unit + that contains E, an empty initial stack, and other context elements + corresponding to the source language thread of execution upon which the user + is focused, if any. -10. ``DW_AT_call_value``, ``DW_AT_call_data_location``, and - ``DW_AT_call_data_value`` + The DWARF is ill-formed if the resulting value V is not an integral value. - A ``DW_TAG_call_site_parameter`` debugger information entry may have a + The resulting V is a bit mask of active lanes for the current program + location. The N\ :sup:`th` least significant bit of the mask corresponds to + the N\ :sup:`th` lane. If the bit is 1 the lane is active, otherwise it is + inactive. + + *Some targets may update the target architecture execution mask for regions + of code that must execute with different sets of lanes than the current + active lanes. For example, some code must execute with all lanes made + temporarily active.* ``DW_AT_LLVM_active_lane`` *allows the compiler to + provide the means to determine the source language active lanes.* + + If not present and ``DW_AT_LLVM_lanes`` is greater than 1, then the target + architecture execution mask is used. + +A.3.4 Call Site Entries and Parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A.3.4.2 Call Site Parameters +++++++++++++++++++++++++++++ + +1. A ``DW_TAG_call_site_parameter`` debugger information entry may have a ``DW_AT_call_value`` attribute, whose value is a DWARF operation expression E\ :sub:`1`\ . @@ -3084,6 +3284,13 @@ :sub:`2` would just be a ``DW_OP_push_object_address``, then the ``DW_AT_call_data_location`` attribute may be omitted. + .. note:: + + DWARF Version 5 implies that ``DW_OP_push_object_address`` may be used + but does not state what object must be specified in the context. Either + ``DW_OP_push_object_address`` cannot be used, or the object to be passed in + the context must be defined. + The value of the ``DW_AT_call_data_value`` attribute is obtained by evaluating E\ :sub:`3` with a context that has a result kind of a value, an unspecified object, the compilation unit that contains E, an empty initial @@ -3092,11 +3299,11 @@ value V\ :sub:`3` is the value in L\ :sub:`2` at the time of the call made by the call site. - The result of these attributes is undefined if the current call frame is - not for the subprogram containing the ``DW_TAG_call_site_parameter`` - debugger information entry or the current program location is not for the - call site containing the ``DW_TAG_call_site_parameter`` debugger information - entry in the current call frame. + The result of these attributes is undefined if the current call frame is not + for the subprogram containing the ``DW_TAG_call_site_parameter`` debugger + information entry or the current program location is not for the call site + containing the ``DW_TAG_call_site_parameter`` debugger information entry in + the current call frame. *The consumer may have to virtually unwind to the call site (see* :ref:`amdgpu-dwarf-call-frame-information`\ *) in order to evaluate these @@ -3117,84 +3324,93 @@ registers that have been clobbered, and clobbered memory will no longer have the value at the time of the call.* -11. ``DW_AT_LLVM_lanes`` *New*
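+
+   *A purely illustrative sketch (the constant and the entry layout are
+   hypothetical): for a parameter whose value at the time of the call is
+   known to have been the constant 42, the producer could emit:*
+
+   ::
+
+     DW_TAG_call_site_parameter
+       DW_AT_call_value (DW_OP_constu 42)
+
+   *Evaluating this operation expression with a result kind of a value
+   yields 42, the parameter's value at the time of the call.*
+
+.. 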
_amdgpu-dwarf-lexical-block-entries: - For languages that are implemented using a SIMD or SIMT execution model, a - ``DW_TAG_subprogram``, ``DW_TAG_inlined_subroutine``, or - ``DW_TAG_entry_point`` debugger information entry may have a - ``DW_AT_LLVM_lanes`` attribute whose value is an integer constant that is - the number of lanes per thread. This is the static number of lanes per - thread. It is not the dynamic number of lanes with which the thread was - initiated, for example, due to smaller or partial work-groups. +A.3.5 Lexical Block Entries +~~~~~~~~~~~~~~~~~~~~~~~~~~~ - If not present, the default value of 1 is used. +.. note:: - The DWARF is ill-formed if the value is 0. + This section is the same as DWARF Version 5 section 3.5. -12. ``DW_AT_LLVM_lane_pc`` *New* +A.4 Data Object and Object List Entries +--------------------------------------- - For languages that are implemented using a SIMD or SIMT execution model, a - ``DW_TAG_subprogram``, ``DW_TAG_inlined_subroutine``, or - ``DW_TAG_entry_point`` debugging information entry may have a - ``DW_AT_LLVM_lane_pc`` attribute whose value is a DWARF expression E. +.. note:: + + This section provides changes to existing debugger information entry + attributes. These would be incorporated into the corresponding DWARF Version 5 + chapter 4 sections. + +A.4.1 Data Object Entries +~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. Any debugging information entry describing a data object (which includes + variables and parameters) or common blocks may have a ``DW_AT_location`` + attribute, whose value is a DWARF expression E. The result of the attribute is obtained by evaluating E with a context that has a result kind of a location description, an unspecified object, the compilation unit that contains E, an empty initial stack, and other context elements corresponding to the source language thread of execution upon which - the user is focused, if any. + the user is focused, if any. The result of the evaluation is the location + description of the base of the data object. - The resulting location description L is for a thread lane count sized vector - of generic type elements. The thread lane count is the value of the - ``DW_AT_LLVM_lanes`` attribute. Each element holds the conceptual program - location of the corresponding lane, where the least significant element - corresponds to the first target architecture specific lane identifier and so - forth. If the lane was not active when the current subprogram was called, - its element is an undefined location description. + See :ref:`amdgpu-dwarf-control-flow-operations` for special evaluation rules + used by the ``DW_OP_call*`` operations. - ``DW_AT_LLVM_lane_pc`` *allows the compiler to indicate conceptually where - each lane of a SIMT thread is positioned even when it is in divergent - control flow that is not active.* + .. note:: - *Typically, the result is a location description with one composite location - description with each part being a location description with either one - undefined location description or one memory location description.* + Delete the description of how the ``DW_OP_call*`` operations evaluate a + ``DW_AT_location`` attribute as that is now described in the operations. - If not present, the thread is not being used in a SIMT manner, and the - thread's current program location is used. + .. note:: -13. ``DW_AT_LLVM_active_lane`` *New* + See the discussion about the ``DW_AT_location`` attribute in the + ``DW_OP_call*`` operation. 
Having each attribute only have a single + purpose and single execution semantics seems desirable. It makes it easier + for the consumer, which no longer has to track the context. It makes it + easier for the producer as it can rely on a single semantics for each + attribute. - For languages that are implemented using a SIMD or SIMT execution model, a - ``DW_TAG_subprogram``, ``DW_TAG_inlined_subroutine``, or - ``DW_TAG_entry_point`` debugger information entry may have a - ``DW_AT_LLVM_active_lane`` attribute whose value is a DWARF expression E. + For that reason, limiting the ``DW_AT_location`` attribute to only + supporting evaluating the location description of an object, and using a + different attribute and encoding class for the evaluation of DWARF + expression *procedures* on the same operation expression stack seems + desirable. - The result of the attribute is obtained by evaluating E with a context that - has a result kind of a value, an unspecified object, the compilation unit - that contains E, an empty initial stack, and other context elements - corresponding to the source language thread of execution upon which the user - is focused, if any. +2. ``DW_AT_const_value`` - The DWARF is ill-formed if the resulting value V is not an integral value. + .. note:: - The resulting V is a bit mask of active lanes for the current program - location. The N\ :sup:`th` least significant bit of the mask corresponds to - the N\ :sup:`th` lane. If the bit is 1 the lane is active, otherwise it is - inactive. + Could deprecate using the ``DW_AT_const_value`` attribute for + ``DW_TAG_variable`` or ``DW_TAG_formal_parameter`` debugger information + entries that have been optimized to a constant. Instead, + ``DW_AT_location`` could be used with a DWARF expression that produces an + implicit location description now that any location description can be + used within a DWARF expression. This allows the ``DW_OP_call*`` operations + to be used to push the location description of any variable regardless of + how it is optimized. - *Some targets may update the target architecture execution mask for regions - of code that must execute with different sets of lanes than the current - active lanes. For example, some code must execute with all lanes made - temporarily active.* ``DW_AT_LLVM_active_lane`` *allows the compiler to - provide the means to determine the source language active lanes.* +A.5 Type Entries +---------------- - If not present and ``DW_AT_LLVM_lanes`` is greater than 1, then the target - architecture execution mask is used. +.. note:: + + This section provides changes to existing debugger information entry + attributes. These would be incorporated into the corresponding DWARF Version 5 + chapter 5 sections. + +.. _amdgpu-dwarf-base-type-entries: + +A.5.1 Base Type Entries +~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: -14. ``DW_AT_LLVM_vector_size`` *New* + The following new attribute is added. - A ``DW_TAG_base_type`` debugger information entry for a base type T may have +1. A ``DW_TAG_base_type`` debugger information entry for a base type T may have a ``DW_AT_LLVM_vector_size`` attribute whose value is an integer constant that is the vector type size N. @@ -3215,76 +3431,143 @@ would not be suitable as the type of a stack value entry. But perhaps that could be replaced by using this attribute.
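+
+   *A purely illustrative sketch, assuming the entry's other attributes
+   continue to describe the element type (the names and values are
+   hypothetical):*
+
+   ::
+
+     DW_TAG_base_type
+       DW_AT_name ("float")
+       DW_AT_encoding (DW_ATE_float)
+       DW_AT_byte_size (4)
+       DW_AT_LLVM_vector_size (4)
+
+   *Here N is 4, describing a vector of four 32-bit floating-point
+   elements.*
+
-15. 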
``DW_AT_LLVM_augmentation`` *New* +A.5.7 Structure, Union, Class and Interface Type Entries +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - A ``DW_TAG_compile_unit`` debugger information entry for a compilation unit - may have a ``DW_AT_LLVM_augmentation`` attribute, whose value is an - augmentation string. +A.5.7.3 Derived or Extended Structures, Classes and Interfaces +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - *The augmentation string allows producers to indicate that there is - additional vendor or target specific information in the debugging - information entries. For example, this might be information about the - version of vendor specific extensions that are being used.* +1. For a ``DW_AT_data_member_location`` attribute there are two cases: - If not present, or if the string is empty, then the compilation unit has no - augmentation string. + 1. If the attribute is an integer constant B, it provides the offset in + bytes from the beginning of the containing entity. - The format for the augmentation string is: + The result of the attribute is obtained by evaluating a + ``DW_OP_LLVM_offset B`` operation with an initial stack comprising the + location description of the beginning of the containing entity. The + result of the evaluation is the location description of the base of the + member entry. - | ``[``\ *vendor*\ ``:v``\ *X*\ ``.``\ *Y*\ [\ ``:``\ *options*\ ]\ ``]``\ * + *If the beginning of the containing entity is not byte aligned, then the + beginning of the member entry has the same bit displacement within a + byte.* - Where *vendor* is the producer, ``vX.Y`` specifies the major X and minor Y - version number of the extensions used, and *options* is an optional string - providing additional information about the extensions. The version number - must conform to semantic versioning [:ref:`SEMVER `]. - The *options* string must not contain the "\ ``]``\ " character. + 2. Otherwise, the attribute must be a DWARF expression E which is evaluated + with a context that has a result kind of a location description, an + unspecified object, the compilation unit that contains E, an initial + stack comprising the location description of the beginning of the + containing entity, and other context elements corresponding to the + source language thread of execution upon which the user is focused, if + any. The result of the evaluation is the location description of the + base of the member entry. - For example: + .. note:: - :: + The beginning of the containing entity can now be any location + description, including those with more than one single location + description, and those with single location descriptions that are of any + kind and have any bit offset. - [abc:v0.0][def:v1.2:feature-a=on,feature-b=3] +A.5.7.8 Member Function Entries ++++++++++++++++++++++++++++++++ -Program Scope Entities ----------------------- +1. An entry for a virtual function also has a ``DW_AT_vtable_elem_location`` + attribute whose value is a DWARF expression E. -.. _amdgpu-dwarf-language-names: + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an initial stack comprising the location + description of the object of the enclosing type, and other context elements + corresponding to the source language thread of execution upon which the user + is focused, if any. 
The result of the evaluation is the location description + of the slot for the function within the virtual function table for the + enclosing class. -Unit Entities -~~~~~~~~~~~~~ +A.5.14 Pointer to Member Type Entries +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. note:: +1. The ``DW_TAG_ptr_to_member_type`` debugging information entry has a + ``DW_AT_use_location`` attribute whose value is a DWARF expression E. It is + used to compute the location description of the member of the class to which + the pointer to member entry points. - This augments DWARF Version 5 section 3.1.1 and Table 3.1. + *The method used to find the location description of a given member of a + class, structure, or union is common to any instance of that class, + structure, or union and to any instance of the pointer to member type. The + method is thus associated with the pointer to member type, rather than with + each object that has a pointer to member type.* -Additional language codes defined for use with the ``DW_AT_language`` attribute -are defined in :ref:`amdgpu-dwarf-language-names-table`. + The ``DW_AT_use_location`` DWARF expression is used in conjunction with the + location description for a particular object of the given pointer to member + type and for a particular structure or class instance. -.. table:: Language Names - :name: amdgpu-dwarf-language-names-table + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an initial stack comprising two entries, + and other context elements corresponding to the source language thread of + execution upon which the user is focused, if any. The first stack entry is + the value of the pointer to member object itself. The second stack entry is + the location description of the base of the entire class, structure, or + union instance containing the member whose location is being calculated. The + result of the evaluation is the location description of the member of the + class to which the pointer to member entry points. - ==================== ============================= - Language Name Meaning - ==================== ============================= - ``DW_LANG_LLVM_HIP`` HIP Language. - ==================== ============================= +A.5.16 Dynamic Type Entries +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The HIP language [:ref:`HIP `] can be supported by extending -the C++ language. +1. The ``DW_AT_data_location`` attribute may be used with any type that + provides one or more levels of hidden indirection and/or run-time parameters + in its representation. Its value is a DWARF operation expression E which + computes the location description of the data for an object. When this + attribute is omitted, the location description of the data is the same as + the location description of the object. -Other Debugger Information --------------------------- + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an object that is the location + description of the data descriptor, the compilation unit that contains E, an + empty initial stack, and other context elements corresponding to the source + language thread of execution upon which the user is focused, if any. The + result of the evaluation is the location description of the base of the + member entry. 
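+
+   *For example (a purely illustrative sketch, assuming a hypothetical
+   descriptor whose base address field is at byte offset 8), the location
+   description of the data could be computed by:*
+
+   ::
+
+     DW_OP_push_object_address
+     DW_OP_LLVM_offset_uconst 8
+     DW_OP_deref
+
+   *The* ``DW_OP_push_object_address`` *operation pushes the location
+   description of the descriptor, the offset operation moves it to the base
+   address field, and* ``DW_OP_deref`` *reads the stored address, which is
+   implicitly converted to a memory location description.*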
-Accelerated Access -~~~~~~~~~~~~~~~~~~ + *E will typically involve an operation expression that begins with a* + ``DW_OP_push_object_address`` *operation which loads the location + description of the object which can then serve as a descriptor in subsequent + calculation.* + + .. note:: + + Since ``DW_AT_data_member_location``, ``DW_AT_use_location``, and + ``DW_AT_vtable_elem_location`` allow both operation expressions and + location list expressions, why does ``DW_AT_data_location`` not allow + both? In all cases they apply to data objects so less likely that + optimization would cause different operation expressions for different + program location ranges. But if supporting for some then should be for + all. + + It seems odd this attribute is not the same as + ``DW_AT_data_member_location`` in having an initial stack with the + location description of the object since the expression has to need it. + +A.6 Other Debugging Information +------------------------------- + +.. note:: + + This section provides changes to existing debugger information entry + attributes. These would be incorporated into the corresponding DWARF Version 5 + chapter 6 sections. + +A.6.1 Accelerated Access +~~~~~~~~~~~~~~~~~~~~~~~~ .. _amdgpu-dwarf-lookup-by-name: -Lookup By Name -++++++++++++++ +A.6.1.1 Lookup By Name +++++++++++++++++++++++ -Contents of the Name Index -########################## +A.6.1.1.1 Contents of the Name Index +#################################### .. note:: @@ -3304,11 +3587,14 @@ or ``DW_OP_form_tls_address`` operation are included; otherwise, they are excluded. -Data Representation of the Name Index -##################################### +A.6.1.1.4 Data Representation of the Name Index +############################################### -Section Header -^^^^^^^^^^^^^^ +.. _amdgpu-dwarf-name-index-section-header: + + +A.6.1.1.4.1 Section Header +^^^^^^^^^^^^^^^^^^^^^^^^^^ .. note:: @@ -3342,14 +3628,14 @@ .. _amdgpu-dwarf-line-number-information: -Line Number Information -~~~~~~~~~~~~~~~~~~~~~~~ +A.6.2 Line Number Information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The Line Number Program Header -++++++++++++++++++++++++++++++ +A.6.2.4 The Line Number Program Header +++++++++++++++++++++++++++++++++++++++ -Standard Content Descriptions -############################# +A.6.2.4.1 Standard Content Descriptions +####################################### .. note:: @@ -3392,8 +3678,8 @@ .. _amdgpu-dwarf-call-frame-information: -Call Frame Information -~~~~~~~~~~~~~~~~~~~~~~ +A.6.4 Call Frame Information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: @@ -3403,12 +3689,12 @@ location description, including those with composite and implicit location descriptions. - These changes would be incorporated into the DWARF Version 5 section 6.1. + These changes would be incorporated into the DWARF Version 5 section 6.4. .. _amdgpu-dwarf-structure_of-call-frame-information: -Structure of Call Frame Information -+++++++++++++++++++++++++++++++++++ +A.6.4.1 Structure of Call Frame Information ++++++++++++++++++++++++++++++++++++++++++++ The register rules are: @@ -3682,8 +3968,8 @@ .. _amdgpu-dwarf-call-frame-instructions: -Call Frame Instructions -+++++++++++++++++++++++ +A.6.4.2 Call Frame Instructions ++++++++++++++++++++++++++++++++ Some call frame instructions have operands that are encoded as DWARF operation expressions E (see :ref:`amdgpu-dwarf-operation-expressions`). The DWARF @@ -3720,8 +4006,8 @@ .. 
_amdgpu-dwarf-row-creation-instructions: -Row Creation Instructions -######################### +A.6.4.2.1 Row Creation Instructions +################################### .. note:: @@ -3729,8 +4015,8 @@ .. _amdgpu-dwarf-cfa-definition-instructions: -CFA Definition Instructions -########################### +A.6.4.2.2 CFA Definition Instructions +##################################### 1. ``DW_CFA_def_cfa`` @@ -3748,7 +4034,7 @@ displacement B. AS is set to the target architecture default address space identifier. The required action is to define the current CFA rule to be the result of evaluating the DWARF operation expression ``DW_OP_constu AS; - DW_OP_aspace_bregx R, B*data_alignment_factor`` as a location description. + DW_OP_aspace_bregx R, B * data_alignment_factor`` as a location description. *The action is the same as* ``DW_CFA_def_cfa``\ *, except that the second operand is signed and factored.* @@ -3773,7 +4059,7 @@ architecture specific address space identifier AS. The required action is to define the current CFA rule to be the result of evaluating the DWARF operation expression ``DW_OP_constu AS; DW_OP_aspace_bregx R, - B*data_alignment_factor`` as a location description. + B * data_alignment_factor`` as a location description. If AS is not one of the values defined by the target architecture specific ``DW_ASPACE_*`` values, then the DWARF expression is ill-formed. @@ -3810,9 +4096,9 @@ The ``DW_CFA_def_cfa_offset_sf`` instruction takes a signed LEB128 operand representing a factored byte displacement B. The required action is to define the current CFA rule to be the result of evaluating the DWARF - operation expression ``DW_OP_constu AS; DW_OP_aspace_bregx R, - B*data_alignment_factor`` as a location description. R and AS are the old - CFA register number and address space respectively. + operation expression ``DW_OP_constu AS; DW_OP_aspace_bregx R, B * + data_alignment_factor`` as a location description. R and AS are the old CFA + register number and address space respectively. If the subprogram has no current CFA rule, or the rule was defined by a ``DW_CFA_def_cfa_expression`` instruction, then the DWARF is ill-formed. @@ -3837,8 +4123,8 @@ .. _amdgpu-dwarf-register-rule-instructions: -Register Rule Instructions -########################## +A.6.4.2.3 Register Rule Instructions +#################################### 1. ``DW_CFA_undefined`` @@ -3857,7 +4143,7 @@ The ``DW_CFA_offset`` instruction takes two operands: a register number R (encoded with the opcode) and an unsigned LEB128 constant representing a factored displacement B. The required action is to change the rule for the - register specified by R to be an *offset(B\*data_alignment_factor)* rule. + register specified by R to be an *offset(B \* data_alignment_factor)* rule. .. note:: @@ -3888,7 +4174,7 @@ The ``DW_CFA_val_offset`` instruction takes two unsigned LEB128 operands representing a register number R and a factored displacement B. The required action is to change the rule for the register indicated by R to be a - *val_offset(B\*data_alignment_factor)* rule. + *val_offset(B \* data_alignment_factor)* rule. .. note:: @@ -3958,22 +4244,22 @@ to ``DW_CFA_restore``, except for the encoding and size of the register operand. -Row State Instructions -###################### +A.6.4.2.4 Row State Instructions +################################ .. note:: These instructions are the same as in DWARF Version 5 section 6.4.2.4. -Padding Instruction -################### +A.6.4.2.5 Padding Instruction +############################# .. 
note:: These instructions are the same as in DWARF Version 5 section 6.4.2.5. -Call Frame Instruction Usage -++++++++++++++++++++++++++++ +A.6.4.3 Call Frame Instruction Usage +++++++++++++++++++++++++++++++++++++ .. note:: @@ -3981,53 +4267,45 @@ .. _amdgpu-dwarf-call-frame-calling-address: -Call Frame Calling Address -++++++++++++++++++++++++++ +A.6.4.4 Call Frame Calling Address +++++++++++++++++++++++++++++++++++ .. note:: The same as in DWARF Version 5 section 6.4.4. -Data Representation -------------------- +A.7 Data Representation +----------------------- + +.. note:: + + This section provides changes to existing debugger information entry + attributes. These would be incorporated into the corresponding DWARF Version 5 + chapter 7 sections. .. _amdgpu-dwarf-32-bit-and-64-bit-dwarf-formats: -32-Bit and 64-Bit DWARF Formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A.7.4 32-Bit and 64-Bit DWARF Formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: - This augments DWARF Version 5 section 7.4. - -1. Within the body of the ``.debug_info`` section, certain forms of attribute - value depend on the choice of DWARF format as follows. For the 32-bit DWARF - format, the value is a 4-byte unsigned integer; for the 64-bit DWARF format, - the value is an 8-byte unsigned integer. - - .. table:: ``.debug_info`` section attribute form roles - :name: amdgpu-dwarf-debug-info-section-attribute-form-roles-table - - ================================== =================================== - Form Role - ================================== =================================== - DW_FORM_line_strp offset in ``.debug_line_str`` - DW_FORM_ref_addr offset in ``.debug_info`` - DW_FORM_sec_offset offset in a section other than - ``.debug_info`` or ``.debug_str`` - DW_FORM_strp offset in ``.debug_str`` - DW_FORM_strp_sup offset in ``.debug_str`` section of - supplementary object file - DW_OP_call_ref offset in ``.debug_info`` - DW_OP_implicit_pointer offset in ``.debug_info`` - DW_OP_LLVM_aspace_implicit_pointer offset in ``.debug_info`` - ================================== =================================== - -Format of Debugging Information -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Attribute Encodings -+++++++++++++++++++ + This augments DWARF Version 5 section 7.4 list item 3's table. + +.. table:: ``.debug_info`` section attribute form roles + :name: amdgpu-dwarf-debug-info-section-attribute-form-roles-table + + ================================== =================================== + Form Role + ================================== =================================== + DW_OP_LLVM_aspace_implicit_pointer offset in ``.debug_info`` + ================================== =================================== + +A.7.5 Format of Debugging Information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A.7.5.4 Attribute Encodings ++++++++++++++++++++++++++++ .. note:: @@ -4049,16 +4327,25 @@ DW_AT_LLVM_vector_size 0x3e0c constant ================================== ====== =================================== -DWARF Expressions -~~~~~~~~~~~~~~~~~ +.. _amdgpu-dwarf-classes-and-forms: + +A.7.5.5 Classes and Forms ++++++++++++++++++++++++++ + +.. note:: + + The same as in DWARF Version 5 section 7.5.5. + +A.7.7 DWARF Expressions +~~~~~~~~~~~~~~~~~~~~~~~ .. note:: Rename DWARF Version 5 section 7.7 to reflect the unification of location descriptions into DWARF expressions. -Operation Expressions -+++++++++++++++++++++ +A.7.7.1 Operation Expressions ++++++++++++++++++++++++++++++ .. 
note:: @@ -4096,16 +4383,16 @@ ULEB128 count ================================== ===== ======== =============================== -Location List Expressions -+++++++++++++++++++++++++ +A.7.7.3 Location List Expressions ++++++++++++++++++++++++++++++++++ .. note:: Rename DWARF Version 5 section 7.7.3 to reflect that location lists are a kind of DWARF expression. -Source Languages -~~~~~~~~~~~~~~~~ +A.7.12 Source Languages +~~~~~~~~~~~~~~~~~~~~~~~ .. note:: @@ -4122,8 +4409,8 @@ ``DW_LANG_LLVM_HIP`` 0x8100 0 ==================== ====== =================== -Address Class and Address Space Encodings -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A.7.13 Address Class and Address Space Encodings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: @@ -4147,8 +4434,8 @@ ``DW_ADDR_LLVM_hi_user`` 0xffff ========================== ====== -Line Number Information -~~~~~~~~~~~~~~~~~~~~~~~ +A.7.22 Line Number Information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: @@ -4167,8 +4454,8 @@ ``DW_LNCT_LLVM_is_MD5`` 0x2002 ==================================== ==================== -Call Frame Information -~~~~~~~~~~~~~~~~~~~~~~ +A.7.24 Call Frame Information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: @@ -4188,8 +4475,8 @@ DW_CFA_LLVM_def_aspace_cfa_sf 0 0x31 ULEB128 register SLEB128 offset ULEB128 address space ============================= ====== ====== ================ ================ ===================== -Attributes by Tag Value (Informative) -------------------------------------- +A. Attributes by Tag Value (Informative) +---------------------------------------- .. note:: @@ -4219,8 +4506,8 @@ .. _amdgpu-dwarf-examples: -Examples -======== +B. Examples +=========== The AMD GPU specific usage of the features in these extensions, including examples, is available at *User Guide for AMDGPU Backend* section @@ -4235,65 +4522,69 @@ .. _amdgpu-dwarf-references: -References -========== +C. References +============= .. _amdgpu-dwarf-AMD: 1. [AMD] `Advanced Micro Devices `__ + .. _amdgpu-dwarf-AMD-ROCgdb: + +2. [AMD-ROCgdb] `AMD ROCm Debugger (ROCgdb) `__ + .. _amdgpu-dwarf-AMD-ROCm: -2. [AMD-ROCm] `AMD ROCm Platform `__ +3. [AMD-ROCm] `AMD ROCm Platform `__ - .. _amdgpu-dwarf-AMD-ROCgdb: + .. _amdgpu-dwarf-AMDGPU-DWARF-LOC: -3. [AMD-ROCgdb] `AMD ROCm Debugger (ROCgdb) `__ +4. [AMDGPU-DWARF-LOC] `Allow Location Descriptions on the DWARF Expression Stack `__ .. _amdgpu-dwarf-AMDGPU-LLVM: -4. [AMDGPU-LLVM] `User Guide for AMDGPU LLVM Backend `__ +5. [AMDGPU-LLVM] `User Guide for AMDGPU LLVM Backend `__ .. _amdgpu-dwarf-CUDA: -5. [CUDA] `Nvidia CUDA Language `__ +6. [CUDA] `Nvidia CUDA Language `__ .. _amdgpu-dwarf-DWARF: -6. [DWARF] `DWARF Debugging Information Format `__ +7. [DWARF] `DWARF Debugging Information Format `__ .. _amdgpu-dwarf-ELF: -7. [ELF] `Executable and Linkable Format (ELF) `__ +8. [ELF] `Executable and Linkable Format (ELF) `__ .. _amdgpu-dwarf-GCC: -8. [GCC] `GCC: The GNU Compiler Collection `__ +9. [GCC] `GCC: The GNU Compiler Collection `__ .. _amdgpu-dwarf-GDB: -9. [GDB] `GDB: The GNU Project Debugger `__ +10. [GDB] `GDB: The GNU Project Debugger `__ .. _amdgpu-dwarf-HIP: -10. [HIP] `HIP Programming Guide `__ +11. [HIP] `HIP Programming Guide `__ .. _amdgpu-dwarf-HSA: -11. [HSA] `Heterogeneous System Architecture (HSA) Foundation `__ +12. [HSA] `Heterogeneous System Architecture (HSA) Foundation `__ .. _amdgpu-dwarf-LLVM: -12. [LLVM] `The LLVM Compiler Infrastructure `__ +13. [LLVM] `The LLVM Compiler Infrastructure `__ .. _amdgpu-dwarf-OpenCL: -13. 
[OpenCL] `The OpenCL Specification Version 2.0 `__ +14. [OpenCL] `The OpenCL Specification Version 2.0 `__ .. _amdgpu-dwarf-Perforce-TotalView: -14. [Perforce-TotalView] `Perforce TotalView HPC Debugging Software `__ +15. [Perforce-TotalView] `Perforce TotalView HPC Debugging Software `__ .. _amdgpu-dwarf-SEMVER: -15. [SEMVER] `Semantic Versioning `__ +16. [SEMVER] `Semantic Versioning `__ diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -2016,9 +2016,10 @@ ------------------------------------- This section describes how certain debugger information entry attributes are -used by AMDGPU. See the sections in DWARF Version 5 section 2 which are updated -by *DWARF Extensions For Heterogeneous Debugging* section -:ref:`amdgpu-dwarf-debugging-information-entry-attributes`. +used by AMDGPU. See DWARF Version 5 sections 3.3.5 and 3.1.1, +which are updated by *DWARF Extensions For Heterogeneous Debugging* sections +:ref:`amdgpu-dwarf-low-level-information` and +:ref:`amdgpu-dwarf-full-and-partial-compilation-unit-entries`. .. _amdgpu-dwarf-dw-at-llvm-lane-pc: diff --git a/llvm/docs/CommandGuide/llvm-profdata.rst b/llvm/docs/CommandGuide/llvm-profdata.rst --- a/llvm/docs/CommandGuide/llvm-profdata.rst +++ b/llvm/docs/CommandGuide/llvm-profdata.rst @@ -185,6 +185,13 @@ inlined by PGO early inliner and it will not be adjusted based on sample profile. +.. option:: -debug-info=path + + Specify the executable or ``.dSYM`` that contains debug info for the raw profile. + When ``-debug-info-correlate`` was used for instrumentation, use this option + to correlate the raw profile. + + EXAMPLES ^^^^^^^^ Basic Usage @@ -197,7 +204,7 @@ Weighted Input ++++++++++++++ -The input file `foo.profdata` is especially important, multiply its counts by 10: +The input file ``foo.profdata`` is especially important; multiply its counts by 10: :: diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst --- a/llvm/docs/DeveloperPolicy.rst +++ b/llvm/docs/DeveloperPolicy.rst @@ -793,10 +793,40 @@ nuisance to other targets and be considered a candidate for deprecation and ultimately removed. -In essences, these rules are necessary for targets to gain and retain their +In essence, these rules are necessary for targets to gain and retain their status, but also markers to define bit-rot, and will be used to clean up the tree from unmaintained targets. +Those wishing to add a new target to LLVM must follow the procedure below: + +1. Read this section and make sure your target follows all requirements. For + minor issues, your community will be responsible for making all necessary + adjustments soon after the initial merge. +2. Send a request for comment (RFC) to the llvm-dev@ mailing list, describing + your target, how it follows all the requirements, and what work has been + done and still needs to be done to meet the official target requirements. + Make sure to expose any and all controversial issues, changes needed in the + base code, TableGen, etc. +3. Once the response is positive, the LLVM community can start reviewing the + actual patches (but they can be prepared before, to support the RFC). Create + a sequence of N patches, numbered '1/N' to 'N/N' (make sure N is an actual + number, not the letter 'N'), that completes the basic structure of the target. +4. The initial patch should add documentation, code owners, and triple support in + clang and LLVM.
The following patches add TableGen infrastructure to describe + the target and lower instructions to assembly. The final patch must show that + the target can lower correctly with extensive LIT tests (IR to MIR, MIR to + ASM, etc.). +5. Some patches may be approved before others, but only after *all* patches are + approved can the whole set be merged in one go. This is to guarantee + that all changes are consistent as a single block. +6. After the initial merge, the target community can stop numbering patches and + start working asynchronously on the target to complete support. They should + still seek review from those who helped them in the initial phase, to make + sure progress remains consistent. +7. Once all official requirements have been fulfilled (as above), the code owner + should request that the target be enabled by default by sending another RFC to + the llvm-dev@ mailing list. + Adding an Established Project To the LLVM Monorepo -------------------------------------------------- diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h --- a/llvm/include/llvm-c/DebugInfo.h +++ b/llvm/include/llvm-c/DebugInfo.h @@ -1102,7 +1102,7 @@ * \param Length Length of the address operation array. */ LLVMMetadataRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Builder, - int64_t *Addr, size_t Length); + uint64_t *Addr, size_t Length); /** * Create a new descriptor for the specified variable that does not have an @@ -1112,7 +1112,7 @@ */ LLVMMetadataRef LLVMDIBuilderCreateConstantValueExpression(LLVMDIBuilderRef Builder, - int64_t Value); + uint64_t Value); /** * Create a new descriptor for the specified variable. diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -141,7 +141,7 @@ template ArrayRef(const std::vector &Vec, std::enable_if_t::value> - * = 0) + * = nullptr) : Data(Vec.data()), Length(Vec.size()) {} /// @} diff --git a/llvm/include/llvm/ADT/ImmutableMap.h b/llvm/include/llvm/ADT/ImmutableMap.h --- a/llvm/include/llvm/ADT/ImmutableMap.h +++ b/llvm/include/llvm/ADT/ImmutableMap.h @@ -264,7 +264,7 @@ : Root(X.getRootWithoutRetain()), Factory(F.getTreeFactory()) {} static inline ImmutableMapRef getEmptyMap(FactoryTy *F) { - return ImmutableMapRef(0, F); + return ImmutableMapRef(nullptr, F); } void manualRetain() { @@ -345,7 +345,7 @@ /// which key is the highest in the ordering of keys in the map. This /// method returns NULL if the map is empty. value_type* getMaxElement() const { - return Root ? &(Root->getMaxElement()->getValue()) : 0; + return Root ?
&(Root->getMaxElement()->getValue()) : nullptr; } //===--------------------------------------------------===// diff --git a/llvm/include/llvm/ADT/ilist.h b/llvm/include/llvm/ADT/ilist.h --- a/llvm/include/llvm/ADT/ilist.h +++ b/llvm/include/llvm/ADT/ilist.h @@ -103,7 +103,7 @@ template struct SFINAE {}; template - static Yes &test(U *I, decltype(I->getNext(&make())) * = 0); + static Yes &test(U *I, decltype(I->getNext(&make())) * = nullptr); template static No &test(...); public: @@ -117,7 +117,7 @@ typedef char No[2]; template - static Yes &test(U *I, decltype(I->createSentinel()) * = 0); + static Yes &test(U *I, decltype(I->createSentinel()) * = nullptr); template static No &test(...); public: diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -555,7 +555,8 @@ /// Determine the possible constant range of an integer or vector of integer /// value. This is intended as a cheap, non-recursive check. - ConstantRange computeConstantRange(const Value *V, bool UseInstrInfo = true, + ConstantRange computeConstantRange(const Value *V, bool ForSigned, + bool UseInstrInfo = true, AssumptionCache *AC = nullptr, const Instruction *CtxI = nullptr, const DominatorTree *DT = nullptr, diff --git a/llvm/include/llvm/Debuginfod/HTTPClient.h b/llvm/include/llvm/Debuginfod/HTTPClient.h --- a/llvm/include/llvm/Debuginfod/HTTPClient.h +++ b/llvm/include/llvm/Debuginfod/HTTPClient.h @@ -13,8 +13,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_HTTP_CLIENT_H -#define LLVM_SUPPORT_HTTP_CLIENT_H +#ifndef LLVM_DEBUGINFOD_HTTPCLIENT_H +#define LLVM_DEBUGINFOD_HTTPCLIENT_H #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" @@ -116,4 +116,4 @@ } // end namespace llvm -#endif // LLVM_SUPPORT_HTTP_CLIENT_H +#endif // LLVM_DEBUGINFOD_HTTPCLIENT_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/DebuggerSupportPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/DebuggerSupportPlugin.h --- a/llvm/include/llvm/ExecutionEngine/Orc/DebuggerSupportPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/DebuggerSupportPlugin.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORT_H -#define LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORT_H +#ifndef LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORTPLUGIN_H +#define LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORTPLUGIN_H #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" @@ -61,4 +61,4 @@ } // namespace orc } // namespace llvm -#endif // LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORT_H +#endif // LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORTPLUGIN_H diff --git a/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h b/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h --- a/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h +++ b/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h @@ -127,7 +127,7 @@ JITTargetAddress getTargetAddress() const { return TargetAddress; } private: - const char *ContentPtr = 0; + const char *ContentPtr = nullptr; uint64_t Size = 0; JITTargetAddress TargetAddress = 0; }; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPAssume.h b/llvm/include/llvm/Frontend/OpenMP/OMPAssume.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Frontend/OpenMP/OMPAssume.h @@ -0,0 +1,55 @@ +//===- 
OpenMP/OMPAssume.h --- OpenMP assumption helper functions - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file provides helper functions and classes to deal with OpenMP +/// assumptions, e.g., as used by `[begin/end] assumes` and `assume`. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FRONTEND_OPENMP_OMPASSUME_H +#define LLVM_FRONTEND_OPENMP_OMPASSUME_H + +#include "llvm/ADT/StringRef.h" + +namespace llvm { + +namespace omp { + +/// Helper to describe assume clauses. +struct AssumptionClauseMappingInfo { + /// The identifier describing the (beginning of the) clause. + llvm::StringLiteral Identifier; + /// Flag to determine if the identifier is a full name or the start of a name. + bool StartsWith; + /// Flag to determine if a directive list follows. + bool HasDirectiveList; + /// Flag to determine if an expression follows. + bool HasExpression; +}; + +/// All known assume clauses. +static constexpr AssumptionClauseMappingInfo AssumptionClauseMappings[] = { +#define OMP_ASSUME_CLAUSE(Identifier, StartsWith, HasDirectiveList, \ + HasExpression) \ + {Identifier, StartsWith, HasDirectiveList, HasExpression}, +#include "llvm/Frontend/OpenMP/OMPKinds.def" +}; + +inline std::string getAllAssumeClauseOptions() { + std::string S; + for (const AssumptionClauseMappingInfo &ACMI : AssumptionClauseMappings) + S += (S.empty() ? "'" : "', '") + ACMI.Identifier.str(); + return S + "'"; +} + +} // namespace omp + +} // namespace llvm + +#endif // LLVM_FRONTEND_OPENMP_OMPASSUME_H diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -25,7 +25,6 @@ class ArrayType; class StructType; class PointerType; -class StringRef; class FunctionType; namespace omp { @@ -80,33 +79,6 @@ #define OMP_IDENT_FLAG(Enum, ...) constexpr auto Enum = omp::IdentFlag::Enum; #include "llvm/Frontend/OpenMP/OMPKinds.def" -/// Helper to describe assume clauses. -struct AssumptionClauseMappingInfo { - /// The identifier describing the (beginning of the) clause. - llvm::StringLiteral Identifier; - /// Flag to determine if the identifier is a full name or the start of a name. - bool StartsWith; - /// Flag to determine if a directive lists follows. - bool HasDirectiveList; - /// Flag to determine if an expression follows. - bool HasExpression; -}; - -/// All known assume clauses. -static constexpr AssumptionClauseMappingInfo AssumptionClauseMappings[] = { -#define OMP_ASSUME_CLAUSE(Identifier, StartsWith, HasDirectiveList, \ - HasExpression) \ - {Identifier, StartsWith, HasDirectiveList, HasExpression}, -#include "llvm/Frontend/OpenMP/OMPKinds.def" -}; - -inline std::string getAllAssumeClauseOptions() { - std::string S; - for (const AssumptionClauseMappingInfo &ACMI : AssumptionClauseMappings) - S += (S.empty() ? "'" : "', '") + ACMI.Identifier.str(); - return S + "'"; -} - /// \note This needs to be kept in sync with kmp.h enum sched_type. /// Todo: Update kmp.h to include this file, and remove the enums in kmp.h /// To complete this, more enum values will need to be moved here.
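A minimal sketch of how a consumer might use the relocated helper after this move; the diagnostic wrapper below is hypothetical, and an in-tree build providing ``OMPKinds.def`` is assumed:

.. code-block:: c++

   #include "llvm/ADT/StringRef.h"
   #include "llvm/Frontend/OpenMP/OMPAssume.h"
   #include "llvm/Support/raw_ostream.h"

   // Hypothetical diagnostic helper. getAllAssumeClauseOptions() returns a
   // quoted, comma-separated list ("'a', 'b', ...") of all known clauses.
   static void diagnoseUnknownAssumeClause(llvm::StringRef Spelling) {
     llvm::errs() << "unknown assume clause '" << Spelling
                  << "', expected one of "
                  << llvm::omp::getAllAssumeClauseOptions() << "\n";
   }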
@@ -140,6 +112,14 @@ LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ OMP_TGT_EXEC_MODE_GENERIC_SPMD) }; +enum class AddressSpace : unsigned { + Generic = 0, + Global = 1, + Shared = 3, + Constant = 4, + Local = 5, +}; + } // end namespace omp } // end namespace llvm diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -663,28 +663,31 @@ Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID); /// Return the (LLVM-IR) string describing the source location \p LocStr. - Constant *getOrCreateSrcLocStr(StringRef LocStr); + Constant *getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize); /// Return the (LLVM-IR) string describing the default source location. - Constant *getOrCreateDefaultSrcLocStr(); + Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize); /// Return the (LLVM-IR) string describing the source location identified by /// the arguments. Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName, - unsigned Line, unsigned Column); + unsigned Line, unsigned Column, + uint32_t &SrcLocStrSize); /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as /// fallback if \p DL does not specify the function name. - Constant *getOrCreateSrcLocStr(DebugLoc DL, Function *F = nullptr); + Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize, + Function *F = nullptr); /// Return the (LLVM-IR) string describing the source location \p Loc. - Constant *getOrCreateSrcLocStr(const LocationDescription &Loc); + Constant *getOrCreateSrcLocStr(const LocationDescription &Loc, + uint32_t &SrcLocStrSize); /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags. /// TODO: Create a enum class for the Reserve2Flags - Value *getOrCreateIdent(Constant *SrcLocStr, - omp::IdentFlag Flags = omp::IdentFlag(0), - unsigned Reserve2Flags = 0); + Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, + omp::IdentFlag Flags = omp::IdentFlag(0), + unsigned Reserve2Flags = 0); /// Create a global flag \p Namein the module with initial value \p Value. GlobalValue *createGlobalFlag(unsigned Value, StringRef Name); @@ -754,7 +757,7 @@ StringMap SrcLocStrMap; /// Map to remember existing ident_t*. - DenseMap, Value *> IdentMap; + DenseMap, Constant *> IdentMap; /// Helper that contains information about regions we need to outline /// during finalization. diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -698,7 +698,6 @@ /// variable which has a complex address expression for its address. /// \param Addr An array of complex address operations. DIExpression *createExpression(ArrayRef Addr = None); - DIExpression *createExpression(ArrayRef Addr); /// Create an expression for a variable that does not have an address, but /// does have a constant value. 
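With the signed ``createExpression`` overload gone, complex address expressions are built from unsigned 64-bit operands only. A short sketch of the remaining call path; the helper and the particular DWARF operators are illustrative, not taken from the patch:

.. code-block:: c++

   #include "llvm/BinaryFormat/Dwarf.h"
   #include "llvm/IR/DIBuilder.h"
   #include <cstdint>

   // Illustrative helper: dereference the variable's location, then add 8.
   // The DIBuilder is assumed to be set up by the caller.
   llvm::DIExpression *makeDerefPlusEight(llvm::DIBuilder &DIB) {
     uint64_t Ops[] = {llvm::dwarf::DW_OP_deref,
                       llvm::dwarf::DW_OP_plus_uconst, 8};
     return DIB.createExpression(Ops); // only the ArrayRef<uint64_t> form remains
   }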
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -589,6 +589,9 @@ inline cst_pred_ty m_LowBitMask() { return cst_pred_ty(); } +inline api_pred_ty m_LowBitMask(const APInt *&V) { + return V; +} struct icmp_pred_with_threshold { ICmpInst::Predicate Pred; diff --git a/llvm/include/llvm/LTO/legacy/LTOModule.h b/llvm/include/llvm/LTO/legacy/LTOModule.h --- a/llvm/include/llvm/LTO/legacy/LTOModule.h +++ b/llvm/include/llvm/LTO/legacy/LTOModule.h @@ -41,7 +41,7 @@ StringRef name; uint32_t attributes = 0; bool isFunction = 0; - const GlobalValue *symbol = 0; + const GlobalValue *symbol = nullptr; }; std::unique_ptr OwnedContext; diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -432,7 +432,7 @@ } // Return the label of the basic block reached on a transition on \p S. - const StringRef getSuccessorLabel(StringRef S) const { + StringRef getSuccessorLabel(StringRef S) const { assert(Successors.count(S) == 1 && "Expected to find successor."); return Successors.find(S)->getValue(); } diff --git a/llvm/include/llvm/ProfileData/InstrProfCorrelator.h b/llvm/include/llvm/ProfileData/InstrProfCorrelator.h --- a/llvm/include/llvm/ProfileData/InstrProfCorrelator.h +++ b/llvm/include/llvm/ProfileData/InstrProfCorrelator.h @@ -12,6 +12,7 @@ #ifndef LLVM_PROFILEDATA_INSTRPROFCORRELATOR_H #define LLVM_PROFILEDATA_INSTRPROFCORRELATOR_H +#include "llvm/ADT/DenseSet.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/Object/Binary.h" #include "llvm/Object/ObjectFile.h" @@ -110,6 +111,7 @@ std::unique_ptr Ctx) : InstrProfCorrelator(Kind, std::move(Ctx)){}; std::vector Names; + llvm::DenseSet CounterOffsets; // Byte-swap the value if necessary. template T maybeSwap(T Value) const { diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -1550,8 +1550,9 @@ } template void addValue(const T &V) { - assert(Location != 0 && "cl::location(...) not specified for a command " - "line option with external storage!"); + assert(Location != nullptr && + "cl::location(...) not specified for a command " + "line option with external storage!"); Location->push_back(V); } }; @@ -1754,8 +1755,9 @@ } template void addValue(const T &V) { - assert(Location != 0 && "cl::location(...) not specified for a command " - "line option with external storage!"); + assert(Location != nullptr && + "cl::location(...) 
not specified for a command " + "line option with external storage!"); *Location |= Bit(V); } diff --git a/llvm/include/llvm/Support/DivisionByConstantInfo.h b/llvm/include/llvm/Support/DivisionByConstantInfo.h --- a/llvm/include/llvm/Support/DivisionByConstantInfo.h +++ b/llvm/include/llvm/Support/DivisionByConstantInfo.h @@ -10,8 +10,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_DIVISON_BY_CONSTANT_INFO_H -#define LLVM_SUPPORT_DIVISON_BY_CONSTANT_INFO_H +#ifndef LLVM_SUPPORT_DIVISIONBYCONSTANTINFO_H +#define LLVM_SUPPORT_DIVISIONBYCONSTANTINFO_H #include "llvm/ADT/APInt.h" diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h --- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h +++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h @@ -221,7 +221,7 @@ } /// Returns the register used as static base in RWPI variants. - virtual const MCRegister getStaticBase() const { return MCRegister::NoRegister; } + virtual MCRegister getStaticBase() const { return MCRegister::NoRegister; } /// Get the target specific RWPI relocation. virtual const MCExpr *getIndirectSymViaRWPI(const MCSymbol *Sym) const { diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -173,7 +173,8 @@ const Optional &B, Type *Ty); /// Return the initial value of \p Obj with type \p Ty if that is a constant. -Constant *getInitialValueForObj(Value &Obj, Type &Ty); +Constant *getInitialValueForObj(Value &Obj, Type &Ty, + const TargetLibraryInfo *TLI); /// Collect all potential underlying objects of \p Ptr at position \p CtxI in /// \p Objects. Assumed information is used and dependences onto \p QueryingAA @@ -1623,10 +1624,17 @@ /// /// This method will evaluate \p Pred on all (transitive) uses of the /// associated value and return true if \p Pred holds every time. + /// If uses are skipped in favor of equivalent ones, e.g., if we look through + /// memory, the \p EquivalentUseCB will be used to give the caller an idea + /// what original used was replaced by a new one (or new ones). The visit is + /// cut short if \p EquivalentUseCB returns false and the function will return + /// false as well. bool checkForAllUses(function_ref Pred, const AbstractAttribute &QueryingAA, const Value &V, bool CheckBBLivenessOnly = false, - DepClassTy LivenessDepClass = DepClassTy::OPTIONAL); + DepClassTy LivenessDepClass = DepClassTy::OPTIONAL, + function_ref + EquivalentUseCB = nullptr); /// Emit a remark generically. 
/// diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1248,8 +1248,8 @@ else GCD = APIntOps::GreatestCommonDivisor(GCD, ScaleForGCD.abs()); - ConstantRange CR = - computeConstantRange(Index.Val.V, true, &AC, Index.CxtI); + ConstantRange CR = computeConstantRange(Index.Val.V, /* ForSigned */ false, + true, &AC, Index.CxtI); KnownBits Known = computeKnownBits(Index.Val.V, DL, 0, &AC, Index.CxtI, DT); CR = CR.intersectWith( diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2890,7 +2890,8 @@ if (RHS_CR.isFullSet()) return ConstantInt::getTrue(ITy); - ConstantRange LHS_CR = computeConstantRange(LHS, IIQ.UseInstrInfo); + ConstantRange LHS_CR = + computeConstantRange(LHS, CmpInst::isSigned(Pred), IIQ.UseInstrInfo); if (!LHS_CR.isFullSet()) { if (RHS_CR.contains(LHS_CR)) return ConstantInt::getTrue(ITy); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6756,17 +6756,27 @@ } static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower, - APInt &Upper, const InstrInfoQuery &IIQ) { + APInt &Upper, const InstrInfoQuery &IIQ, + bool PreferSignedRange) { unsigned Width = Lower.getBitWidth(); const APInt *C; switch (BO.getOpcode()) { case Instruction::Add: if (match(BO.getOperand(1), m_APInt(C)) && !C->isZero()) { - // FIXME: If we have both nuw and nsw, we should reduce the range further. - if (IIQ.hasNoUnsignedWrap(cast(&BO))) { + bool HasNSW = IIQ.hasNoSignedWrap(&BO); + bool HasNUW = IIQ.hasNoUnsignedWrap(&BO); + + // If the caller expects a signed compare, then try to use a signed range. + // Otherwise if both no-wraps are set, use the unsigned range because it + // is never larger than the signed range. Example: + // "add nuw nsw i8 X, -2" is unsigned [254,255] vs. signed [-128, 125]. + if (PreferSignedRange && HasNSW && HasNUW) + HasNUW = false; + + if (HasNUW) { // 'add nuw x, C' produces [C, UINT_MAX]. Lower = *C; - } else if (IIQ.hasNoSignedWrap(cast(&BO))) { + } else if (HasNSW) { if (C->isNegative()) { // 'add nsw x, -C' produces [SINT_MIN, SINT_MAX - C]. Lower = APInt::getSignedMinValue(Width); @@ -7083,8 +7093,8 @@ } } -ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo, - AssumptionCache *AC, +ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned, + bool UseInstrInfo, AssumptionCache *AC, const Instruction *CtxI, const DominatorTree *DT, unsigned Depth) { @@ -7102,7 +7112,7 @@ APInt Lower = APInt(BitWidth, 0); APInt Upper = APInt(BitWidth, 0); if (auto *BO = dyn_cast(V)) - setLimitsForBinOp(*BO, Lower, Upper, IIQ); + setLimitsForBinOp(*BO, Lower, Upper, IIQ, ForSigned); else if (auto *II = dyn_cast(V)) setLimitsForIntrinsic(*II, Lower, Upper); else if (auto *SI = dyn_cast(V)) @@ -7134,8 +7144,10 @@ // Currently we just use information from comparisons. if (!Cmp || Cmp->getOperand(0) != V) continue; - ConstantRange RHS = computeConstantRange(Cmp->getOperand(1), UseInstrInfo, - AC, I, DT, Depth + 1); + // TODO: Set "ForSigned" parameter via Cmp->isSigned()? 
+ ConstantRange RHS = + computeConstantRange(Cmp->getOperand(1), /* ForSigned */ false, + UseInstrInfo, AC, I, DT, Depth + 1); CR = CR.intersectWith( ConstantRange::makeAllowedICmpRegion(Cmp->getPredicate(), RHS)); } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -3857,7 +3857,8 @@ for (unsigned i = 0; i != CB->arg_size(); ++i) { for (Attribute::AttrKind Kind : {Attribute::ByVal, Attribute::StructRet, Attribute::InAlloca}) { - if (!CB->paramHasAttr(i, Kind)) + if (!CB->paramHasAttr(i, Kind) || + CB->getParamAttr(i, Kind).getValueAsType()) continue; CB->removeParamAttr(i, Kind); diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -857,8 +857,10 @@ StringRef CompilerVersion = CU->getProducer(); Version FrontVer = parseVersion(CompilerVersion); OS.AddComment("Frontend version"); - for (int N : FrontVer.Part) + for (int N : FrontVer.Part) { + N = std::min<int>(N, std::numeric_limits<uint16_t>::max()); OS.emitInt16(N); + } // Some Microsoft tools, like Binscope, expect a backend version number of at // least 8.something, so we'll coerce the LLVM version into a form that diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1189,7 +1189,7 @@ DefinitionArgs = SP->getType()->getTypeArray(); if (DeclArgs.size() && DefinitionArgs.size()) - if (DefinitionArgs[0] != NULL && DeclArgs[0] != DefinitionArgs[0]) + if (DefinitionArgs[0] != nullptr && DeclArgs[0] != DefinitionArgs[0]) addType(SPDie, DefinitionArgs[0]); DeclDie = getDIE(SPDecl); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3655,7 +3655,6 @@ if (!Ty.isVector()) { if (!is_contained(NonVecOpIndices, OpIdx)) return false; - is_contained(NonVecOpIndices, OpIdx); continue; } diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp --- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp +++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp @@ -195,12 +195,12 @@ } Polynomial(const APInt &A, unsigned ErrorMSBs = 0) - : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(A) {} + : ErrorMSBs(ErrorMSBs), V(nullptr), B(), A(A) {} Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0) - : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(BitWidth, A) {} + : ErrorMSBs(ErrorMSBs), V(nullptr), B(), A(BitWidth, A) {} - Polynomial() : ErrorMSBs((unsigned)-1), V(NULL), B(), A() {} + Polynomial() : ErrorMSBs((unsigned)-1), V(nullptr), B(), A() {} /// Increment and clamp the number of undefined bits.
void incErrorMSBs(unsigned amt) { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6438,12 +6438,6 @@ unsigned ShiftAmount = OuterBitSize - InnerBitSize; EVT ShiftAmountTy = getShiftAmountTy(VT, DAG.getDataLayout()); - if (APInt::getMaxValue(ShiftAmountTy.getSizeInBits()).ult(ShiftAmount)) { - // FIXME getShiftAmountTy does not always return a sensible result when VT - // is an illegal type, and so the type may be too small to fit the shift - // amount. Override it with i32. The shift will have to be legalized. - ShiftAmountTy = MVT::i32; - } SDValue Shift = DAG.getConstant(ShiftAmount, dl, ShiftAmountTy); if (!LH.getNode() && !RH.getNode() && diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -328,7 +328,7 @@ // Find the FSProfile file name. The internal option takes the precedence // before getting from TargetMachine. -static const std::string getFSProfileFile(const TargetMachine *TM) { +static std::string getFSProfileFile(const TargetMachine *TM) { if (!FSProfileFile.empty()) return FSProfileFile.getValue(); const Optional &PGOOpt = TM->getPGOOption(); @@ -339,7 +339,7 @@ // Find the Profile remapping file name. The internal option takes the // precedence before getting from TargetMachine. -static const std::string getFSRemappingFile(const TargetMachine *TM) { +static std::string getFSRemappingFile(const TargetMachine *TM) { if (!FSRemappingFile.empty()) return FSRemappingFile.getValue(); const Optional &PGOOpt = TM->getPGOOption(); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -215,15 +215,16 @@ OS << "void"; return DWARFDie(); } - DWARFDie Inner = resolveReferencedType(D); + DWARFDie InnerDIE; + auto Inner = [&] { return InnerDIE = resolveReferencedType(D); }; const dwarf::Tag T = D.getTag(); switch (T) { case DW_TAG_pointer_type: { - appendPointerLikeTypeBefore(D, Inner, "*"); + appendPointerLikeTypeBefore(D, Inner(), "*"); break; } case DW_TAG_subroutine_type: { - appendQualifiedNameBefore(Inner); + appendQualifiedNameBefore(Inner()); if (Word) { OS << ' '; } @@ -231,18 +232,18 @@ break; } case DW_TAG_array_type: { - appendQualifiedNameBefore(Inner); + appendQualifiedNameBefore(Inner()); break; } case DW_TAG_reference_type: - appendPointerLikeTypeBefore(D, Inner, "&"); + appendPointerLikeTypeBefore(D, Inner(), "&"); break; case DW_TAG_rvalue_reference_type: - appendPointerLikeTypeBefore(D, Inner, "&&"); + appendPointerLikeTypeBefore(D, Inner(), "&&"); break; case DW_TAG_ptr_to_member_type: { - appendQualifiedNameBefore(Inner); - if (needsParens(Inner)) + appendQualifiedNameBefore(Inner()); + if (needsParens(InnerDIE)) OS << '('; else if (Word) OS << ' '; @@ -284,7 +285,7 @@ const char *NamePtr = dwarf::toString(D.find(DW_AT_name), nullptr); if (!NamePtr) { appendTypeTagName(D.getTag()); - return Inner; + return DWARFDie(); } Word = true; StringRef Name = NamePtr; @@ -317,7 +318,7 @@ break; } } - return Inner; + return InnerDIE; } void appendUnqualifiedNameAfter(DWARFDie D, DWARFDie Inner, @@ -801,6 +802,8 @@ const char *NamePtr = getShortName(); if (!NamePtr) return; + if (getTag() == DW_TAG_GNU_template_parameter_pack) + return; 
DWARFTypePrinter(OS).appendUnqualifiedName(*this, OriginalFullName); } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -322,12 +322,19 @@ unsigned NumDebugInfoErrors = 0; ReferenceMap CrossUnitReferences; + unsigned Index = 1; for (const auto &Unit : Units) { - ReferenceMap UnitLocalReferences; - NumDebugInfoErrors += - verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences); - NumDebugInfoErrors += verifyDebugInfoReferences( - UnitLocalReferences, [&](uint64_t Offset) { return Unit.get(); }); + OS << "Verifying unit: " << Index << " / " << Units.getNumUnits(); + if (const char* Name = Unit->getUnitDIE(true).getShortName()) + OS << ", \"" << Name << '\"'; + OS << '\n'; + OS.flush(); + ReferenceMap UnitLocalReferences; + NumDebugInfoErrors += + verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences); + NumDebugInfoErrors += verifyDebugInfoReferences( + UnitLocalReferences, [&](uint64_t Offset) { return Unit.get(); }); + ++Index; } NumDebugInfoErrors += verifyDebugInfoReferences( diff --git a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp --- a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp +++ b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp @@ -33,8 +33,8 @@ class SourceCode { std::unique_ptr MemBuf; - const Optional load(StringRef FileName, - const Optional &EmbeddedSource) { + Optional load(StringRef FileName, + const Optional &EmbeddedSource) { if (Lines <= 0) return None; @@ -50,7 +50,7 @@ } } - const Optional pruneSource(const Optional &Source) { + Optional pruneSource(const Optional &Source) { if (!Source) return None; size_t FirstLinePos = StringRef::npos, Pos = 0; diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp --- a/llvm/lib/Debuginfod/Debuginfod.cpp +++ b/llvm/lib/Debuginfod/Debuginfod.cpp @@ -36,7 +36,7 @@ Expected> getDefaultDebuginfodUrls() { const char *DebuginfodUrlsEnv = std::getenv("DEBUGINFOD_URLS"); - if (DebuginfodUrlsEnv == NULL) + if (DebuginfodUrlsEnv == nullptr) return SmallVector(); SmallVector DebuginfodUrls; diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp --- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp @@ -611,7 +611,7 @@ DynamicLibrarySearchGenerator::GetForCurrentProcess(GlobalPrefix, Pred); if (!ProcessSymsGenerator) { - *Result = 0; + *Result = nullptr; return wrap(ProcessSymsGenerator.takeError()); } @@ -637,7 +637,7 @@ DynamicLibrarySearchGenerator::Load(FileName, GlobalPrefix, Pred); if (!LibrarySymsGenerator) { - *Result = 0; + *Result = nullptr; return wrap(LibrarySymsGenerator.takeError()); } @@ -657,7 +657,7 @@ auto LibrarySymsGenerator = StaticLibraryDefinitionGenerator::Load(*unwrap(ObjLayer), FileName, TT); if (!LibrarySymsGenerator) { - *Result = 0; + *Result = nullptr; return wrap(LibrarySymsGenerator.takeError()); } *Result = wrap(LibrarySymsGenerator->release()); @@ -666,7 +666,7 @@ auto LibrarySymsGenerator = StaticLibraryDefinitionGenerator::Load(*unwrap(ObjLayer), FileName); if (!LibrarySymsGenerator) { - *Result = 0; + *Result = nullptr; return wrap(LibrarySymsGenerator.takeError()); } *Result = wrap(LibrarySymsGenerator->release()); @@ -712,7 +712,7 @@ auto JTMB = JITTargetMachineBuilder::detectHost(); if (!JTMB) { - Result = 0; + Result = nullptr; return 
wrap(JTMB.takeError()); } @@ -876,7 +876,7 @@ LLVMOrcDisposeLLJITBuilder(Builder); if (!J) { - Result = 0; + Result = nullptr; return wrap(J.takeError()); } diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp @@ -24,7 +24,7 @@ Expected SimpleExecutorMemoryManager::allocate(uint64_t Size) { std::error_code EC; auto MB = sys::Memory::allocateMappedMemory( - Size, 0, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC); + Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC); if (EC) return errorCodeToError(EC); std::lock_guard Lock(M); diff --git a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp --- a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp @@ -114,11 +114,11 @@ // Copy the address to all the other groups, if they have not // been initialized. - if (CodeMem.Near.base() == 0) + if (CodeMem.Near.base() == nullptr) CodeMem.Near = MB; - if (RODataMem.Near.base() == 0) + if (RODataMem.Near.base() == nullptr) RODataMem.Near = MB; - if (RWDataMem.Near.base() == 0) + if (RWDataMem.Near.base() == nullptr) RWDataMem.Near = MB; // Remember that we allocated this memory diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -21,7 +21,9 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" @@ -37,6 +39,7 @@ #include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include #include #define DEBUG_TYPE "openmp-ir-builder" @@ -255,19 +258,21 @@ return GV; } -Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, - IdentFlag LocFlags, - unsigned Reserve2Flags) { +Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, + uint32_t SrcLocStrSize, + IdentFlag LocFlags, + unsigned Reserve2Flags) { // Enable "C-mode". 
LocFlags |= OMP_IDENT_FLAG_KMPC; - Value *&Ident = + Constant *&Ident = IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}]; if (!Ident) { Constant *I32Null = ConstantInt::getNullValue(Int32); - Constant *IdentData[] = { - I32Null, ConstantInt::get(Int32, uint32_t(LocFlags)), - ConstantInt::get(Int32, Reserve2Flags), I32Null, SrcLocStr}; + Constant *IdentData[] = {I32Null, + ConstantInt::get(Int32, uint32_t(LocFlags)), + ConstantInt::get(Int32, Reserve2Flags), + ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr}; Constant *Initializer = ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData); @@ -290,10 +295,12 @@ } } - return Builder.CreatePointerCast(Ident, IdentPtr); + return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr); } -Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) { +Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr, + uint32_t &SrcLocStrSize) { + SrcLocStrSize = LocStr.size(); Constant *&SrcLocStr = SrcLocStrMap[LocStr]; if (!SrcLocStr) { Constant *Initializer = @@ -314,8 +321,8 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName, - unsigned Line, - unsigned Column) { + unsigned Line, unsigned Column, + uint32_t &SrcLocStrSize) { SmallString<128> Buffer; Buffer.push_back(';'); Buffer.append(FileName); @@ -327,17 +334,21 @@ Buffer.append(std::to_string(Column)); Buffer.push_back(';'); Buffer.push_back(';'); - return getOrCreateSrcLocStr(Buffer.str()); + return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize); } -Constant *OpenMPIRBuilder::getOrCreateDefaultSrcLocStr() { - return getOrCreateSrcLocStr(";unknown;unknown;0;0;;"); +Constant * +OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) { + StringRef UnknownLoc = ";unknown;unknown;0;0;;"; + return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize); } -Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL, Function *F) { +Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL, + uint32_t &SrcLocStrSize, + Function *F) { DILocation *DIL = DL.get(); if (!DIL) - return getOrCreateDefaultSrcLocStr(); + return getOrCreateDefaultSrcLocStr(SrcLocStrSize); StringRef FileName = M.getName(); if (DIFile *DIF = DIL->getFile()) if (Optional Source = DIF->getSource()) @@ -346,12 +357,13 @@ if (Function.empty() && F) Function = F->getName(); return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(), - DIL->getColumn()); + DIL->getColumn(), SrcLocStrSize); } -Constant * -OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc) { - return getOrCreateSrcLocStr(Loc.DL, Loc.IP.getBlock()->getParent()); +Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc, + uint32_t &SrcLocStrSize) { + return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize, + Loc.IP.getBlock()->getParent()); } Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) { @@ -393,9 +405,11 @@ break; } - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Args[] = {getOrCreateIdent(SrcLocStr, BarrierLocFlags), - getOrCreateThreadID(getOrCreateIdent(SrcLocStr))}; + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Args[] = { + getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags), + getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))}; // If we are in a cancellable parallel region, barriers are cancellation // points. 
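The same two-step idiom recurs at every call site below: fetch the source-location string together with its size, then hand both to ``getOrCreateIdent`` so the size can be recorded in the ``ident_t``. A condensed sketch of the pattern; the wrapper function is hypothetical, while the API names come from the hunks above:

.. code-block:: c++

   #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
   using namespace llvm;

   // Hypothetical wrapper showing the updated call-site pattern: the string
   // size produced by getOrCreateSrcLocStr feeds the ident_t size field.
   static Value *emitThreadID(OpenMPIRBuilder &OMPBuilder,
                              const OpenMPIRBuilder::LocationDescription &Loc) {
     uint32_t SrcLocStrSize;
     Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
     Constant *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
     return OMPBuilder.getOrCreateThreadID(Ident);
   }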
@@ -441,8 +455,9 @@ llvm_unreachable("Unknown cancel kind!"); } - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind}; Value *Result = Builder.CreateCall( getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args); @@ -513,8 +528,9 @@ if (!updateToLocation(Loc)) return Loc.IP; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadID = getOrCreateThreadID(Ident); if (NumThreads) { @@ -871,8 +887,9 @@ void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) { // Build call void __kmpc_flush(ident_t *loc) - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Args[] = {getOrCreateIdent(SrcLocStr)}; + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)}; Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args); } @@ -886,8 +903,9 @@ void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) { // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32 // global_tid); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *Args[] = {Ident, getOrCreateThreadID(Ident)}; // Ignore return result until untied tasks are supported. @@ -903,8 +921,9 @@ void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) { // Build call __kmpc_omp_taskyield(loc, thread_id, 0); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Constant *I32Null = ConstantInt::getNullValue(Int32); Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null}; @@ -1114,14 +1133,16 @@ Module *Module = Func->getParent(); Value *RedArrayPtr = Builder.CreateBitCast(RedArray, Builder.getInt8PtrTy(), "red.array.ptr"); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); bool CanGenerateAtomic = llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) { return RI.AtomicReductionGen; }); - Value *Ident = getOrCreateIdent( - SrcLocStr, CanGenerateAtomic ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE - : IdentFlag(0)); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, + CanGenerateAtomic + ? 
IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE + : IdentFlag(0)); Value *ThreadId = getOrCreateThreadID(Ident); Constant *NumVariables = Builder.getInt32(NumReductions); const DataLayout &DL = Module->getDataLayout(); @@ -1235,8 +1256,9 @@ return Loc.IP; Directive OMPD = Directive::OMPD_master; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId}; @@ -1258,8 +1280,9 @@ return Loc.IP; Directive OMPD = Directive::OMPD_masked; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId, Filter}; Value *ArgsEnd[] = {Ident, ThreadId}; @@ -1480,8 +1503,9 @@ Builder.restoreIP(CLI->getPreheaderIP()); Builder.SetCurrentDebugLocation(DL); - Constant *SrcLocStr = getOrCreateSrcLocStr(DL); - Value *SrcLoc = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); + Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); // Declare useful OpenMP runtime functions. Value *IV = CLI->getIndVar(); @@ -1608,8 +1632,9 @@ // Set up the source location value for OpenMP runtime. Builder.SetCurrentDebugLocation(DL); - Constant *SrcLocStr = getOrCreateSrcLocStr(DL); - Value *SrcLoc = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); + Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); // Declare useful OpenMP runtime functions. 
Value *IV = CLI->getIndVar(); @@ -2379,8 +2404,9 @@ if (!updateToLocation(Loc)) return Loc.IP; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt); @@ -2407,8 +2433,9 @@ } Directive OMPD = Directive::OMPD_single; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId}; @@ -2436,8 +2463,9 @@ return Loc.IP; Directive OMPD = Directive::OMPD_critical; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *LockVar = getOMPCriticalRegionLock(CriticalName); Value *Args[] = {Ident, ThreadId, LockVar}; @@ -2486,8 +2514,9 @@ Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP( ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)}); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP}; @@ -2512,8 +2541,9 @@ Instruction *ExitCall = nullptr; if (IsThreads) { - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId}; @@ -2718,8 +2748,9 @@ IRBuilder<>::InsertPointGuard IPG(Builder); Builder.restoreIP(Loc.IP); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {ThreadId, Size, Allocator}; @@ -2734,8 +2765,9 @@ IRBuilder<>::InsertPointGuard IPG(Builder); Builder.restoreIP(Loc.IP); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {ThreadId, Addr, Allocator}; Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free); @@ -2748,8 +2780,9 @@ IRBuilder<>::InsertPointGuard IPG(Builder); Builder.restoreIP(Loc.IP); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); 
Constant *ThreadPrivateCache = getOrCreateOMPInternalVariable(Int8PtrPtr, Name); @@ -2767,8 +2800,9 @@ if (!updateToLocation(Loc)) return Loc.IP; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); ConstantInt *IsSPMDVal = ConstantInt::getSigned( IntegerType::getInt8Ty(Int8->getContext()), IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); @@ -2820,8 +2854,9 @@ if (!updateToLocation(Loc)) return; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); ConstantInt *IsSPMDVal = ConstantInt::getSigned( IntegerType::getInt8Ty(Int8->getContext()), IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -1668,8 +1668,8 @@ ++i, ++GTI) switch (IdxCompare(CE1->getOperand(i), CE2->getOperand(i), GTI.getIndexedType())) { - case -1: return isSigned ? ICmpInst::ICMP_SLT:ICmpInst::ICMP_ULT; - case 1: return isSigned ? ICmpInst::ICMP_SGT:ICmpInst::ICMP_UGT; + case -1: return ICmpInst::ICMP_ULT; + case 1: return ICmpInst::ICMP_UGT; case -2: return ICmpInst::BAD_ICMP_PREDICATE; } @@ -1678,7 +1678,7 @@ for (; i < CE1->getNumOperands(); ++i) if (!CE1->getOperand(i)->isNullValue()) { if (isa(CE1->getOperand(i))) - return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; + return ICmpInst::ICMP_UGT; else return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal. } @@ -1686,7 +1686,7 @@ for (; i < CE2->getNumOperands(); ++i) if (!CE2->getOperand(i)->isNullValue()) { if (isa(CE2->getOperand(i))) - return isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; + return ICmpInst::ICMP_ULT; else return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal. } diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -142,12 +142,12 @@ if (AttrKind == Attribute::AttrKind::ByVal) { // After r362128, byval attributes need to have a type attribute. Provide a // NULL one until a proper API is added for this. - return wrap(Attribute::getWithByValType(Ctx, NULL)); + return wrap(Attribute::getWithByValType(Ctx, nullptr)); } if (AttrKind == Attribute::AttrKind::StructRet) { // Same as byval. - return wrap(Attribute::getWithStructRetType(Ctx, NULL)); + return wrap(Attribute::getWithStructRetType(Ctx, nullptr)); } return wrap(Attribute::get(Ctx, AttrKind, Val)); diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -821,12 +821,6 @@ return DIExpression::get(VMContext, Addr); } -DIExpression *DIBuilder::createExpression(ArrayRef Signed) { - // TODO: Remove the callers of this signed version and delete. 
- SmallVector Addr(Signed.begin(), Signed.end()); - return createExpression(Addr); -} - template static DISubprogram *getSubprogram(bool IsDistinct, Ts &&...Args) { if (IsDistinct) diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -1436,14 +1436,14 @@ } LLVMMetadataRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Builder, - int64_t *Addr, size_t Length) { - return wrap(unwrap(Builder)->createExpression(ArrayRef(Addr, - Length))); + uint64_t *Addr, size_t Length) { + return wrap( + unwrap(Builder)->createExpression(ArrayRef(Addr, Length))); } LLVMMetadataRef LLVMDIBuilderCreateConstantValueExpression(LLVMDIBuilderRef Builder, - int64_t Value) { + uint64_t Value) { return wrap(unwrap(Builder)->createConstantValueExpression(Value)); } diff --git a/llvm/lib/InterfaceStub/IFSHandler.cpp b/llvm/lib/InterfaceStub/IFSHandler.cpp --- a/llvm/lib/InterfaceStub/IFSHandler.cpp +++ b/llvm/lib/InterfaceStub/IFSHandler.cpp @@ -195,7 +195,7 @@ } Error ifs::writeIFSToOutputStream(raw_ostream &OS, const IFSStub &Stub) { - yaml::Output YamlOut(OS, NULL, /*WrapColumn =*/0); + yaml::Output YamlOut(OS, nullptr, /*WrapColumn =*/0); std::unique_ptr CopyStub(new IFSStubTriple(Stub)); if (Stub.Target.Arch) { CopyStub->Target.ArchString = std::string( diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp --- a/llvm/lib/ObjectYAML/MachOEmitter.cpp +++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp @@ -481,9 +481,9 @@ typedef std::pair writeOperation; std::vector WriteQueue; - MachO::dyld_info_command *DyldInfoOnlyCmd = 0; - MachO::symtab_command *SymtabCmd = 0; - MachO::dysymtab_command *DSymtabCmd = 0; + MachO::dyld_info_command *DyldInfoOnlyCmd = nullptr; + MachO::symtab_command *SymtabCmd = nullptr; + MachO::dysymtab_command *DSymtabCmd = nullptr; for (auto &LC : Obj.LoadCommands) { switch (LC.Data.load_command_data.cmd) { case MachO::LC_SYMTAB: diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp --- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp +++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp @@ -129,6 +129,7 @@ correlateProfileDataImpl(); auto Result = collectPGOFuncNameStrings(Names, /*doCompression=*/true, CompressedNames); + CounterOffsets.clear(); Names.clear(); return Result; } @@ -139,6 +140,9 @@ IntPtrT CounterOffset, IntPtrT FunctionPtr, uint32_t NumCounters) { + // Check if a probe was already added for this counter offset. + if (!CounterOffsets.insert(CounterOffset).second) + return; Data.push_back({ maybeSwap(IndexedInstrProf::ComputeHash(FunctionName)), maybeSwap(CFGHash), diff --git a/llvm/lib/Remarks/Remark.cpp b/llvm/lib/Remarks/Remark.cpp --- a/llvm/lib/Remarks/Remark.cpp +++ b/llvm/lib/Remarks/Remark.cpp @@ -111,7 +111,7 @@ ArrayRef Args = unwrap(Remark)->Args; // No arguments to iterate on. if (Args.empty()) - return NULL; + return nullptr; return reinterpret_cast( const_cast(Args.begin())); } @@ -119,13 +119,13 @@ extern "C" LLVMRemarkArgRef LLVMRemarkEntryGetNextArg(LLVMRemarkArgRef ArgIt, LLVMRemarkEntryRef Remark) { // No more arguments to iterate on. 
- if (ArgIt == NULL) - return NULL; + if (ArgIt == nullptr) + return nullptr; auto It = (ArrayRef<Argument>::const_iterator)ArgIt; auto Next = std::next(It); if (Next == unwrap(Remark)->Args.end()) - return NULL; + return nullptr; return reinterpret_cast<LLVMRemarkArgRef>(const_cast<Argument *>(Next)); } diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -83,12 +83,12 @@ StringRef::const_iterator CIP = CPUInfoStart; - StringRef::const_iterator CPUStart = 0; + StringRef::const_iterator CPUStart = nullptr; size_t CPULen = 0; // We need to find the first line which starts with cpu, spaces, and a colon. // After the colon, there may be some additional spaces and then the cpu type. - while (CIP < CPUInfoEnd && CPUStart == 0) { + while (CIP < CPUInfoEnd && CPUStart == nullptr) { if (CIP < CPUInfoEnd && *CIP == '\n') ++CIP; @@ -118,12 +118,12 @@ } } - if (CPUStart == 0) + if (CPUStart == nullptr) while (CIP < CPUInfoEnd && *CIP != '\n') ++CIP; } - if (CPUStart == 0) + if (CPUStart == nullptr) return generic; return StringSwitch<const char *>(StringRef(CPUStart, CPULen)) diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -273,7 +273,7 @@ // the program, and not the eventual binary file. Therefore, call realpath // so this behaves the same on all platforms. #if _POSIX_VERSION >= 200112 || defined(__GLIBC__) - if (char *real_path = realpath(exe_path, NULL)) { + if (char *real_path = realpath(exe_path, nullptr)) { std::string ret = std::string(real_path); free(real_path); return ret; diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -246,7 +246,7 @@ MI.getOperand(0).setReg(DefReg); ToBeRemoved.insert(&MI); - LLVM_DEBUG({ dbgs() << "Removed: " << MI << "\n"; }); + LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n"); return true; } @@ -259,8 +259,7 @@ MLI = &getAnalysis<MachineLoopInfo>(); MRI = &MF.getRegInfo(); - if (!MRI->isSSA()) - return false; + assert(MRI->isSSA() && "Expected to be run on SSA form!"); bool Changed = false; SmallSetVector<MachineInstr *, 8> ToBeRemoved; @@ -278,6 +277,7 @@ break; case AArch64::ORRWrs: Changed = visitORR(MI, ToBeRemoved); + break; } } } diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp --- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -633,7 +633,7 @@ /// Return true when the instruction is processed successfully. bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const { - assert (DefiningMI != NULL); + assert(DefiningMI != nullptr); if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -623,7 +623,8 @@ Function *Callee = CI->getCalledFunction(); // Ignore indirect calls.
- if (Callee == 0) return false; + if (Callee == nullptr) + return false; BasicBlock *BB = CI->getParent(); LLVMContext &Context = CI->getParent()->getContext(); @@ -1402,8 +1403,8 @@ Function *UCallee = UI->getCalledFunction(); Type *RetType = UCallee->getReturnType(); B.SetInsertPoint(&*ItNew); - AllocaInst *Alloc = B.CreateAlloca(RetType, 0, - std::string(prefix) + UI->getName()); + AllocaInst *Alloc = + B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName()); Alloc->setAlignment( Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType))); return Alloc; @@ -1724,7 +1725,8 @@ // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) continue; + if (Callee == nullptr) + continue; LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; dbgs().flush()); @@ -1757,7 +1759,7 @@ // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) + if (Callee == nullptr) continue; LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; @@ -1783,9 +1785,10 @@ // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) continue; + if (Callee == nullptr) + continue; - if(Simplifier.useNative(CI)) + if (Simplifier.useNative(CI)) Changed = true; } } @@ -1811,7 +1814,7 @@ // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) + if (Callee == nullptr) continue; if (Simplifier.useNative(CI)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -463,7 +463,7 @@ WhatToStore.push_back(Arg); } } else if (isa<FixedVectorType>(ArgType)) { - Type *IType = NULL; + Type *IType = nullptr; uint32_t EleCount = cast<FixedVectorType>(ArgType)->getNumElements(); uint32_t EleSize = ArgType->getScalarSizeInBits(); uint32_t TotalSize = EleCount * EleSize; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1847,7 +1847,8 @@ if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0) return false; - if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) + if (MI.mayLoad() && + TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr) return false; if (AnchorList.count(&MI)) diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -188,7 +188,7 @@ unsigned Offset = countTrailingZeros(InstrMode.Mask); unsigned Width = countTrailingOnes(InstrMode.Mask >> Offset); unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); - BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32)) .addImm(Value) .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) | (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) | diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -14682,7 +14682,9 @@ return SDValue(); } -// Check that N is CMPZ(CSINC(0, 0, CC, X)), return X if valid. +// Check that N is CMPZ(CSINC(0, 0, CC, X)) +// or CMPZ(CMOV(1, 0, CC, $cpsr, X)); +// return X if valid.
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) { if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1))) return SDValue(); @@ -14696,12 +14698,24 @@ CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse()) CSInc = CSInc.getOperand(0); - if (CSInc.getOpcode() != ARMISD::CSINC || - !isNullConstant(CSInc.getOperand(0)) || - !isNullConstant(CSInc.getOperand(1)) || !CSInc->hasOneUse()) - return SDValue(); - CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2); - return CSInc.getOperand(3); + if (CSInc.getOpcode() == ARMISD::CSINC && + isNullConstant(CSInc.getOperand(0)) && + isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) { + CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2); + return CSInc.getOperand(3); + } + if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) && + isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) { + CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2); + return CSInc.getOperand(4); + } + if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) && + isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) { + CC = ARMCC::getOppositeCondition( + (ARMCC::CondCodes)CSInc.getConstantOperandVal(2)); + return CSInc.getOperand(4); + } + return SDValue(); } static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) { diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/llvm/lib/Target/ARM/ARMTargetObjectFile.h --- a/llvm/lib/Target/ARM/ARMTargetObjectFile.h +++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.h @@ -24,7 +24,7 @@ void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - const MCRegister getStaticBase() const override; + MCRegister getStaticBase() const override; const MCExpr *getIndirectSymViaRWPI(const MCSymbol *Sym) const override; diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp --- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -54,9 +54,7 @@ } } -const MCRegister ARMElfTargetObjectFile::getStaticBase() const { - return ARM::R9; -} +MCRegister ARMElfTargetObjectFile::getStaticBase() const { return ARM::R9; } const MCExpr *ARMElfTargetObjectFile:: getIndirectSymViaRWPI(const MCSymbol *Sym) const { diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -1870,7 +1870,7 @@ } template <int shift> bool isMemRegRQOffset() const { - if (!isMVEMem() || Memory.OffsetImm != 0 || Memory.Alignment != 0) + if (!isMVEMem() || Memory.OffsetImm != nullptr || Memory.Alignment != 0) return false; if (!ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains( diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1123,9 +1123,8 @@ DenseMap<unsigned, int> RegOffsets; int FloatRegCount = 0; // Process each .cfi directive and build up compact unwind info.
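The IsCMPZCSINC extension above accepts CMOV(1, 0, CC) as-is and CMOV(0, 1, CC) after flipping the condition, so both operand orders reduce to one canonical "1 if CC else 0" form. A stand-alone sketch of that normalization with toy types (not the SelectionDAG API):

enum Cond { EQ, NE };

static Cond getOppositeCondition(Cond C) { return C == EQ ? NE : EQ; }

struct CMov { int TrueVal; int FalseVal; Cond CC; };

// Returns true when N is a 0/1 select and leaves in CC the condition under
// which the result is 1, matching how the two CMOV cases above fold into
// the existing CSINC handling.
static bool matchOneZeroCMov(const CMov &N, Cond &CC) {
  if (N.TrueVal == 1 && N.FalseVal == 0) {
    CC = N.CC;
    return true;
  }
  if (N.TrueVal == 0 && N.FalseVal == 1) {
    CC = getOppositeCondition(N.CC);
    return true;
  }
  return false;
}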
- for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + for (const MCCFIInstruction &Inst : Instrs) { unsigned Reg; - const MCCFIInstruction &Inst = Instrs[i]; switch (Inst.getOperation()) { case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa CFARegisterOffset = Inst.getOffset(); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -264,10 +264,8 @@ void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes) { OS << "\t.unwind_raw " << Offset; - for (SmallVectorImpl<uint8_t>::const_iterator OCI = Opcodes.begin(), - OCE = Opcodes.end(); - OCI != OCE; ++OCI) - OS << ", 0x" << Twine::utohexstr(*OCI); + for (uint8_t Opcode : Opcodes) + OS << ", 0x" << Twine::utohexstr(Opcode); OS << '\n'; } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -338,8 +338,8 @@ {codeview::RegisterId::ARM_NQ14, ARM::Q14}, {codeview::RegisterId::ARM_NQ15, ARM::Q15}, }; - for (unsigned I = 0; I < array_lengthof(RegMap); ++I) - MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg)); + for (const auto &I : RegMap) + MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg)); } static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) { diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -1147,9 +1147,8 @@ // predecessors. ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); bool Modified = false; - for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator - I = RPOT.begin(), E = RPOT.end(); I != E; ++I) - Modified |= ReduceMBB(**I); + for (MachineBasicBlock *MBB : RPOT) + Modified |= ReduceMBB(*MBB); return Modified; } diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -874,7 +874,8 @@ // Allow reg+<6bit> offset. if (Offs < 0) Offs = -Offs; - if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 0 && isUInt<6>(Offs)) { + if (AM.BaseGV == nullptr && AM.HasBaseReg && AM.Scale == 0 && + isUInt<6>(Offs)) { return true; } @@ -2012,7 +2013,7 @@ std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result(nullptr, 0); SDLoc DL(Op); EVT Ty = Op.getValueType(); diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -304,11 +304,11 @@ } Cond.clear(); - FBB = 0; + FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through.
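Several hunks above (ARMAsmBackend, ARMELFStreamer, ARMMCTargetDesc, Thumb2SizeReduction) apply the same mechanical cleanup: an explicit iterator or index loop becomes a range-based for. A minimal before/after in plain C++:

#include <cstdio>
#include <vector>

static void emitOpcodes(const std::vector<unsigned char> &Opcodes) {
  // Before: iterator-pair loop with manual begin()/end() bookkeeping.
  for (std::vector<unsigned char>::const_iterator OCI = Opcodes.begin(),
                                                  OCE = Opcodes.end();
       OCI != OCE; ++OCI)
    std::printf(", 0x%x", *OCI);

  // After: same iteration order; the element binding carries the intent.
  for (unsigned char Opcode : Opcodes)
    std::printf(", 0x%x", Opcode);
}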
if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); UnCondBrIter = MBB.end(); diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.h b/llvm/lib/Target/AVR/AVRRegisterInfo.h --- a/llvm/lib/Target/AVR/AVRRegisterInfo.h +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.h @@ -27,7 +27,7 @@ public: const uint16_t * - getCalleeSavedRegs(const MachineFunction *MF = 0) const override; + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; @@ -39,7 +39,7 @@ /// Stack Frame Processing Methods void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; Register getFrameRegister(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -281,7 +281,7 @@ OperandVector const &Operands, uint64_t const &ErrorInfo) { SMLoc ErrorLoc = Loc; - char const *Diag = 0; + char const *Diag = nullptr; if (ErrorInfo != ~0U) { if (ErrorInfo >= Operands.size()) { diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -1002,7 +1002,7 @@ VarType = Type::getInt64Ty(BB->getContext()); // 64bit ptr or enum value GV = new GlobalVariable(*M, VarType, false, GlobalVariable::ExternalLinkage, - NULL, AccessKey); + nullptr, AccessKey); GV->addAttribute(BPFCoreSharedInfo::AmaAttr); GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta); GEPGlobals[AccessKey] = GV; diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -325,7 +325,7 @@ default: { errs() << "LowerFormalArguments Unhandled argument type: " << RegVT.getEVTString() << '\n'; - llvm_unreachable(0); + llvm_unreachable(nullptr); } case MVT::i32: case MVT::i64: diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp --- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp @@ -105,10 +105,10 @@ BasicBlock *BB = Call->getParent(); IntegerType *VarType = Type::getInt64Ty(BB->getContext()); - std::string GVName = BaseName + std::to_string(Count) + "$" + - std::to_string(Reloc); + std::string GVName = + BaseName + std::to_string(Count) + "$" + std::to_string(Reloc); GlobalVariable *GV = new GlobalVariable( - *M, VarType, false, GlobalVariable::ExternalLinkage, NULL, GVName); + *M, VarType, false, GlobalVariable::ExternalLinkage, nullptr, GVName); GV->addAttribute(BPFCoreSharedInfo::TypeIdAttr); GV->setMetadata(LLVMContext::MD_preserve_access_index, MD); diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp @@ -50,7 +50,7 @@ void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, const char *Modifier) { - assert((Modifier == 0 || 
Modifier[0] == 0) && "No modifiers supported"); + assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported"); const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { O << getRegisterName(Op.getReg()); diff --git a/llvm/lib/Target/Hexagon/BitTracker.cpp b/llvm/lib/Target/Hexagon/BitTracker.cpp --- a/llvm/lib/Target/Hexagon/BitTracker.cpp +++ b/llvm/lib/Target/Hexagon/BitTracker.cpp @@ -940,8 +940,8 @@ // If evaluated successfully add the targets to the cumulative list. if (Trace) { dbgs() << " adding targets:"; - for (unsigned i = 0, n = BTs.size(); i < n; ++i) - dbgs() << " " << printMBBReference(*BTs[i]); + for (const MachineBasicBlock *BT : BTs) + dbgs() << " " << printMBBReference(*BT); if (FallsThrough) dbgs() << "\n falls through\n"; else diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -3260,13 +3260,12 @@ dbgs() << "Group[" << i << "] inp: " << printReg(G.Inp.Reg, HRI, G.Inp.Sub) << " out: " << printReg(G.Out.Reg, HRI, G.Out.Sub) << "\n"; - for (unsigned j = 0, m = G.Ins.size(); j < m; ++j) - dbgs() << " " << *G.Ins[j]; + for (const MachineInstr *MI : G.Ins) + dbgs() << " " << *MI; } }); - for (unsigned i = 0, n = Groups.size(); i < n; ++i) { - InstrGroup &G = Groups[i]; + for (InstrGroup &G : Groups) { if (!isShuffleOf(G.Out.Reg, G.Inp.Reg)) continue; auto LoopInpEq = [G] (const PhiInfo &P) -> bool { diff --git a/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp --- a/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -118,13 +118,10 @@ return false; // Loop over all of the basic blocks. - for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); - MBBb != MBBe; ++MBBb) { - MachineBasicBlock *MBB = &*MBBb; - + for (MachineBasicBlock &MBB : Fn) { // Traverse the basic block. - MachineBasicBlock::iterator MII = MBB->getFirstTerminator(); - if (MII != MBB->end()) { + MachineBasicBlock::iterator MII = MBB.getFirstTerminator(); + if (MII != MBB.end()) { MachineInstr &MI = *MII; int Opc = MI.getOpcode(); if (IsConditionalBranch(Opc)) { @@ -155,17 +152,17 @@ // Remove BB2 // BB3: ... // BB4: ... - unsigned NumSuccs = MBB->succ_size(); - MachineBasicBlock::succ_iterator SI = MBB->succ_begin(); + unsigned NumSuccs = MBB.succ_size(); + MachineBasicBlock::succ_iterator SI = MBB.succ_begin(); MachineBasicBlock* FirstSucc = *SI; MachineBasicBlock* SecondSucc = *(++SI); MachineBasicBlock* LayoutSucc = nullptr; MachineBasicBlock* JumpAroundTarget = nullptr; - if (MBB->isLayoutSuccessor(FirstSucc)) { + if (MBB.isLayoutSuccessor(FirstSucc)) { LayoutSucc = FirstSucc; JumpAroundTarget = SecondSucc; - } else if (MBB->isLayoutSuccessor(SecondSucc)) { + } else if (MBB.isLayoutSuccessor(SecondSucc)) { LayoutSucc = SecondSucc; JumpAroundTarget = FirstSucc; } else { @@ -201,7 +198,7 @@ if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); + MBB.replaceSuccessor(JumpAroundTarget, UncondTarget); // Remove the unconditional branch in LayoutSucc.
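One pitfall when rangifying loops over pointer containers, which the HexagonBitSimplify hunk above has to get right: the element variable is now the pointer itself, so the stream insertion still needs a dereference to print the object rather than its address. A small sketch with a toy Instr type:

#include <iostream>
#include <vector>

struct Instr {
  int Opcode;
};

static std::ostream &operator<<(std::ostream &OS, const Instr &I) {
  return OS << "instr(" << I.Opcode << ")";
}

static void dump(const std::vector<Instr *> &Ins) {
  for (const Instr *MI : Ins)
    std::cout << ' ' << *MI; // "*MI" prints the instruction; a bare "MI"
                             // would silently print the pointer value.
}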
LayoutSucc->erase(LayoutSucc->begin()); diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td --- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td +++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td @@ -126,16 +126,16 @@ // HVX 128-byte mode CCIfHvx128< - CCIfType<[v32i32,v64i16,v128i8], + CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16], CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>, CCIfHvx128< - CCIfType<[v64i32,v128i16,v256i8], + CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16], CCAssignToReg<[W0,W1,W2,W3,W4,W5,W6,W7]>>>, CCIfHvx128< - CCIfType<[v32i32,v64i16,v128i8], + CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16], CCAssignToStack<128,128>>>, CCIfHvx128< - CCIfType<[v64i32,v128i16,v256i8], + CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16], CCAssignToStack<256,128>>>, CCDelegateTo @@ -152,10 +152,10 @@ // HVX 128-byte mode CCIfHvx128< - CCIfType<[v32i32,v64i16,v128i8], + CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16], CCAssignToReg<[V0]>>>, CCIfHvx128< - CCIfType<[v64i32,v128i16,v256i8], + CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16], CCAssignToReg<[W0]>>>, CCDelegateTo diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp --- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -290,13 +290,11 @@ raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED; raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){ - using const_iterator = NodeToUsesMap::const_iterator; - - for (const_iterator I = M.begin(), E = M.end(); I != E; ++I) { - const UseSet &Us = I->second; - OS << I->first << " -> #" << Us.size() << '{'; - for (UseSet::const_iterator J = Us.begin(), F = Us.end(); J != F; ++J) { - User *R = (*J)->getUser(); + for (const auto &I : M) { + const UseSet &Us = I.second; + OS << I.first << " -> #" << Us.size() << '{'; + for (const Use *U : Us) { + User *R = U->getUser(); if (R->hasName()) OS << ' ' << R->getName(); else @@ -420,15 +418,12 @@ // instruction that uses another GEP instruction as the base pointer, the // gep node for the base pointer should already exist. 
ValueToNodeMap NM; - for (ValueVect::iterator I = BO.begin(), E = BO.end(); I != E; ++I) { - BasicBlock *B = cast<BasicBlock>(*I); - for (BasicBlock::iterator J = B->begin(), F = B->end(); J != F; ++J) { - if (!isa<GetElementPtrInst>(J)) - continue; - GetElementPtrInst *GepI = cast<GetElementPtrInst>(J); - if (isHandledGepForm(GepI)) - processGepInst(GepI, NM); - } + for (Value *I : BO) { + BasicBlock *B = cast<BasicBlock>(I); + for (Instruction &J : *B) + if (auto *GepI = dyn_cast<GetElementPtrInst>(&J)) + if (isHandledGepForm(GepI)) + processGepInst(GepI, NM); } LLVM_DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes); @@ -436,17 +431,14 @@ static void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM, NodeVect &Roots) { - using const_iterator = NodeVect::const_iterator; - - for (const_iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { - GepNode *N = *I; - if (N->Flags & GepNode::Root) { - Roots.push_back(N); - continue; - } - GepNode *PN = N->Parent; - NCM[PN].push_back(N); + for (GepNode *N : Nodes) { + if (N->Flags & GepNode::Root) { + Roots.push_back(N); + continue; } + GepNode *PN = N->Parent; + NCM[PN].push_back(N); + } } static void nodes_for_root(GepNode *Root, NodeChildrenMap &NCM, @@ -546,8 +538,7 @@ using NodeSetMap = std::map<unsigned, NodeSet>; NodeSetMap MaybeEq; - for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { - GepNode *N = *I; + for (GepNode *N : Nodes) { unsigned H = node_hash(N); MaybeEq[H].insert(N); } @@ -556,9 +547,8 @@ // one for equality and the other for non-equality. NodeSymRel EqRel; // Equality relation (as set of equivalence classes). NodePairSet Eq, Ne; // Caches. - for (NodeSetMap::iterator I = MaybeEq.begin(), E = MaybeEq.end(); - I != E; ++I) { - NodeSet &S = I->second; + for (auto &I : MaybeEq) { + NodeSet &S = I.second; for (NodeSet::iterator NI = S.begin(), NE = S.end(); NI != NE; ++NI) { GepNode *N = *NI; // If node already has a class, then the class must have been created @@ -612,8 +602,7 @@ // Update the min element's flags, and user list. uint32_t Flags = 0; UseSet &MinUs = Uses[Min]; - for (NodeSet::iterator J = S.begin(), F = S.end(); J != F; ++J) { - GepNode *N = *J; + for (GepNode *N : S) { uint32_t NF = N->Flags; // If N is used, append all original values of N to the list of // original values of Min. @@ -633,8 +622,7 @@ // selected (minimum) node from the corresponding equivalence class. // If a given parent does not have an equivalence class, leave it // unchanged (it means that it's the only element in its class). - for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { - GepNode *N = *I; + for (GepNode *N : Nodes) { if (N->Flags & GepNode::Root) continue; const NodeSet *PC = node_class(N->Parent, EqRel); @@ -652,8 +640,7 @@ // Finally, erase the nodes that are no longer used. NodeSet Erase; - for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { - GepNode *N = *I; + for (GepNode *N : Nodes) { const NodeSet *PC = node_class(N, EqRel); if (!PC) continue; @@ -663,7 +650,7 @@ if (N == F->second) continue; // Node for removal.
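The HexagonCommonGEP hunk above folds an isa<> test followed by a cast<> into one dyn_cast<>, which tests and converts in a single step. The same shape with standard C++ RTTI standing in for LLVM's casting utilities:

#include <vector>

struct Instruction { virtual ~Instruction() = default; };
struct GetElementPtrInst : Instruction { bool Handled = true; };

static int processGeps(const std::vector<Instruction *> &Block) {
  int Processed = 0;
  for (Instruction *J : Block)
    // One test-and-convert replaces the isa<>/cast<> pair; GepI is only
    // in scope when the dynamic type check succeeded.
    if (auto *GepI = dynamic_cast<GetElementPtrInst *>(J))
      if (GepI->Handled)
        ++Processed;
  return Processed;
}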
- Erase.insert(*I); + Erase.insert(N); } erase_if(Nodes, in_set(Erase)); @@ -775,8 +762,7 @@ NodeToUsesMap::iterator UF = Uses.find(Node); assert(UF != Uses.end() && "Used node with no use information"); UseSet &Us = UF->second; - for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) { - Use *U = *I; + for (Use *U : Us) { User *R = U->getUser(); if (!isa<Instruction>(R)) continue; @@ -790,8 +776,7 @@ NodeChildrenMap::iterator CF = NCM.find(Node); if (CF != NCM.end()) { NodeVect &Cs = CF->second; - for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) { - GepNode *CN = *I; + for (GepNode *CN : Cs) { NodeToValueMap::iterator LF = Loc.find(CN); // If the child is only used in GEP instructions (i.e. is not used in // non-GEP instructions), the nearest dominator computed for it may @@ -831,8 +816,8 @@ NodeChildrenMap::iterator CF = NCM.find(Node); if (CF != NCM.end()) { NodeVect &Cs = CF->second; - for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) - recalculatePlacementRec(*I, NCM, Loc); + for (GepNode *C : Cs) + recalculatePlacementRec(C, NCM, Loc); } BasicBlock *LB = recalculatePlacement(Node, NCM, Loc); LLVM_DEBUG(dbgs() << "LocRec end for node:" << Node << '\n'); @@ -921,8 +906,8 @@ NodeChildrenMap::iterator CF = NCM.find(Node); if (CF != NCM.end()) { NodeVect &Cs = CF->second; - for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) - adjustForInvariance(*I, NCM, Loc); + for (GepNode *C : Cs) + adjustForInvariance(C, NCM, Loc); } return LocB; } @@ -938,10 +923,9 @@ raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED ; raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) { - for (NodeToValueMap::const_iterator I = Loc.Map.begin(), E = Loc.Map.end(); - I != E; ++I) { - OS << I->first << " -> "; - if (BasicBlock *B = cast_or_null<BasicBlock>(I->second)) + for (const auto &I : Loc.Map) { + OS << I.first << " -> "; + if (BasicBlock *B = cast_or_null<BasicBlock>(I.second)) OS << B->getName() << '(' << B << ')'; else OS << "<null>"; @@ -1016,8 +1000,7 @@ // Collect all used nodes together with the uses from loads and stores, // where the GEP node could be folded into the load/store instruction. NodeToUsesMap FNs; // Foldable nodes. - for (NodeSet::iterator I = Ns.begin(), E = Ns.end(); I != E; ++I) { - GepNode *N = *I; + for (GepNode *N : Ns) { if (!(N->Flags & GepNode::Used)) continue; NodeToUsesMap::iterator UF = Uses.find(N); @@ -1025,8 +1008,7 @@ UseSet &Us = UF->second; // Loads/stores that use the node N. UseSet LSs; - for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) { - Use *U = *J; + for (Use *U : Us) { User *R = U->getUser(); // We're interested in uses that provide the address. It can happen // that the value may also be provided via GEP, but we won't handle @@ -1051,11 +1033,11 @@ LLVM_DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs); - for (NodeToUsesMap::iterator I = FNs.begin(), E = FNs.end(); I != E; ++I) { - GepNode *N = I->first; - UseSet &Us = I->second; - for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) - separateChainForNode(N, *J, Loc); + for (auto &FN : FNs) { + GepNode *N = FN.first; + UseSet &Us = FN.second; + for (Use *U : Us) + separateChainForNode(N, U, Loc); } } @@ -1068,21 +1050,21 @@ // Compute the initial placement determined by the users' locations, and // the locations of the child nodes.
- for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) - recalculatePlacementRec(*I, NCM, Loc); + for (GepNode *Root : Roots) + recalculatePlacementRec(Root, NCM, Loc); LLVM_DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc)); if (OptEnableInv) { - for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) - adjustForInvariance(*I, NCM, Loc); + for (GepNode *Root : Roots) + adjustForInvariance(Root, NCM, Loc); LLVM_DEBUG(dbgs() << "Node placement after adjustment for invariance:\n" << LocationAsBlock(Loc)); } if (OptEnableConst) { - for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) - separateConstantChains(*I, NCM, Loc); + for (GepNode *Root : Roots) + separateConstantChains(Root, NCM, Loc); } LLVM_DEBUG(dbgs() << "Node use information:\n" << Uses); @@ -1153,8 +1135,8 @@ NodeToUsesMap::iterator UF = Uses.find(N); assert(UF != Uses.end() && "No use information for used node"); UseSet &Us = UF->second; - for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) - Values.push_back((*I)->getUser()); + for (const auto &U : Us) + Values.push_back(U->getUser()); } NodeChildrenMap::iterator CF = NCM.find(N); if (CF != NCM.end()) { @@ -1223,8 +1205,7 @@ // to the Roots list. if (LastCN > 0) { NodeVect &Cs = NCM[Last]; - for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) { - GepNode *CN = *I; + for (GepNode *CN : Cs) { CN->Flags &= ~GepNode::Internal; CN->Flags |= GepNode::Root; CN->BaseVal = NewInst; @@ -1238,10 +1219,8 @@ NodeToUsesMap::iterator UF = Uses.find(Last); assert(UF != Uses.end() && "No use information found"); UseSet &Us = UF->second; - for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) { - Use *U = *I; + for (Use *U : Us) U->set(NewInst); - } } } } @@ -1261,8 +1240,8 @@ ValueVect Ins; for (Instruction &I : llvm::reverse(*B)) Ins.push_back(&I); - for (ValueVect::iterator I = Ins.begin(), E = Ins.end(); I != E; ++I) { - Instruction *In = cast(*I); + for (Value *I : Ins) { + Instruction *In = cast(I); if (isInstructionTriviallyDead(In)) In->eraseFromParent(); } diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp --- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -125,8 +125,8 @@ }; LatticeCell() : Kind(Top), Size(0), IsSpecial(false) { - for (unsigned i = 0; i < MaxCellSize; ++i) - Values[i] = nullptr; + for (const Constant *&Value : Values) + Value = nullptr; } bool meet(const LatticeCell &L); @@ -1029,8 +1029,8 @@ ToRemove.push_back(const_cast(SB)); Targets.remove(SB); } - for (unsigned i = 0, n = ToRemove.size(); i < n; ++i) - removeCFGEdge(B, ToRemove[i]); + for (MachineBasicBlock *MBB : ToRemove) + removeCFGEdge(B, MBB); // If there are any blocks left in the computed targets, it means that // we think that the block could go somewhere, but the CFG does not. // This could legitimately happen in blocks that have non-returning diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp --- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -612,8 +612,8 @@ // Simply keep a list of children of B, and traverse that list. 
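Both the removeCFGEdge loop above and the dominator-tree walk that follows snapshot the items first and mutate afterwards, because erasing while walking the live container would invalidate the iteration. A compact sketch of the collect-then-mutate pattern:

#include <algorithm>
#include <vector>

static void removeDeadTargets(std::vector<int> &Targets) {
  // Pass 1: collect candidates while the container is stable.
  std::vector<int> ToRemove;
  for (int T : Targets)
    if (T < 0) // stand-in for "edge not actually present in the CFG"
      ToRemove.push_back(T);
  // Pass 2: mutate; the iteration above has already finished.
  for (int T : ToRemove)
    Targets.erase(std::find(Targets.begin(), Targets.end(), T));
}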
using DTNodeVectType = SmallVector<MachineDomTreeNode *, 4>; DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N)); - for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) { - MachineBasicBlock *SB = (*I)->getBlock(); + for (auto &I : Cn) { + MachineBasicBlock *SB = I->getBlock(); if (!Deleted.count(SB)) Changed |= visitBlock(SB, L); } @@ -648,8 +648,8 @@ << "\n"); bool Changed = false; if (L) { - for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) - Changed |= visitLoop(*I); + for (MachineLoop *I : *L) + Changed |= visitLoop(I); } MachineBasicBlock *EntryB = GraphTraits<MachineFunction *>::getEntryNode(MFN); @@ -964,8 +964,8 @@ using DTNodeVectType = SmallVector<MachineDomTreeNode *, 4>; DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N)); - for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) { - MachineBasicBlock *SB = (*I)->getBlock(); + for (auto &I : Cn) { + MachineBasicBlock *SB = I->getBlock(); MDT->changeImmediateDominator(SB, IDB); } } @@ -973,8 +973,8 @@ while (!B->succ_empty()) B->removeSuccessor(B->succ_begin()); - for (auto I = B->pred_begin(), E = B->pred_end(); I != E; ++I) - (*I)->removeSuccessor(B, true); + for (MachineBasicBlock *Pred : B->predecessors()) + Pred->removeSuccessor(B, true); Deleted.insert(B); MDT->eraseNode(B); @@ -1064,8 +1064,8 @@ Deleted.clear(); bool Changed = false; - for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I) - Changed |= visitLoop(*I); + for (MachineLoop *L : *MLI) + Changed |= visitLoop(L); Changed |= visitLoop(nullptr); return Changed; diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp --- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -1106,8 +1106,7 @@ } bool HexagonExpandCondsets::isIntraBlocks(LiveInterval &LI) { - for (LiveInterval::iterator I = LI.begin(), E = LI.end(); I != E; ++I) { - LiveRange::Segment &LR = *I; + for (LiveRange::Segment &LR : LI) { // Range must start at a register... if (!LR.start.isRegister()) return false; @@ -1160,16 +1159,16 @@ // Move all live segments from L2 to L1. using ValueInfoMap = DenseMap<VNInfo *, VNInfo *>; ValueInfoMap VM; - for (LiveInterval::iterator I = L2.begin(), E = L2.end(); I != E; ++I) { - VNInfo *NewVN, *OldVN = I->valno; + for (LiveRange::Segment &I : L2) { + VNInfo *NewVN, *OldVN = I.valno; ValueInfoMap::iterator F = VM.find(OldVN); if (F == VM.end()) { - NewVN = L1.getNextValue(I->valno->def, LIS->getVNInfoAllocator()); + NewVN = L1.getNextValue(I.valno->def, LIS->getVNInfoAllocator()); VM.insert(std::make_pair(OldVN, NewVN)); } else { NewVN = F->second; } - L1.addSegment(LiveRange::Segment(I->start, I->end, NewVN)); + L1.addSegment(LiveRange::Segment(I.start, I.end, NewVN)); } while (!L2.empty()) L2.removeSegment(*L2.begin()); diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -416,8 +416,8 @@ UnsignedMap RPO; RPOTType RPOT(&MF); unsigned RPON = 0; - for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) - RPO[(*I)->getNumber()] = RPON++; + for (auto &I : RPOT) + RPO[I->getNumber()] = RPON++; // Don't process functions that have loops, at least for now. Placement // of prolog and epilog must take loop structure into account.
For simpli- diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -458,6 +458,7 @@ SelectionDAG &DAG) const; SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const; @@ -468,7 +469,6 @@ SDValue LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerHvxMul(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2720,7 +2720,6 @@ HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) const { if (Ty.isVector()) { - assert(Ty.isInteger() && "Only integer vectors are supported here"); unsigned W = Ty.getSizeInBits(); if (W <= 64) return DAG.getBitcast(Ty, DAG.getConstant(0, dl, MVT::getIntegerVT(W))); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp old mode 100644 new mode 100755 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -55,6 +55,12 @@ addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass); addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass); addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass); + if (Subtarget.useHVXV68Ops() && Subtarget.useHVXFloatingPoint()) { + addRegisterClass(MVT::v32f32, &Hexagon::HvxVRRegClass); + addRegisterClass(MVT::v64f16, &Hexagon::HvxVRRegClass); + addRegisterClass(MVT::v64f32, &Hexagon::HvxWRRegClass); + addRegisterClass(MVT::v128f16, &Hexagon::HvxWRRegClass); + } } // Set up operation actions. @@ -83,6 +89,27 @@ setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() && + Subtarget.useHVXFloatingPoint()) { + // Handle ISD::BUILD_VECTOR for v32f32 in a custom way to generate vsplat + setOperationAction(ISD::BUILD_VECTOR, MVT::v32f32, Custom); + + // BUILD_VECTOR with f16 operands cannot be promoted without + // promoting the result, so lower the node to vsplat or constant pool + setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom); + setOperationAction(ISD::SPLAT_VECTOR, MVT::f16, Custom); + setOperationAction(ISD::SPLAT_VECTOR, MVT::v64f16, Legal); + setOperationAction(ISD::SPLAT_VECTOR, MVT::v32f32, Legal); + + // Custom-lower BUILD_VECTOR for vector pairs. The standard (target- + // independent) handling of it would convert it to a load, which is + // not always the optimal choice. + setOperationAction(ISD::BUILD_VECTOR, MVT::v64f32, Custom); + // Make concat-vectors custom to handle concats of more than 2 vectors. 
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v128f16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64f32, Custom); + } + for (MVT T : LegalV) { setIndexedLoadAction(ISD::POST_INC, T, Legal); setIndexedStoreAction(ISD::POST_INC, T, Legal); @@ -497,7 +524,9 @@ assert(ElemSize*VecLen == HwLen); SmallVector<SDValue, 32> Words; - if (VecTy.getVectorElementType() != MVT::i32) { + if (VecTy.getVectorElementType() != MVT::i32 && + !(Subtarget.useHVXFloatingPoint() && + VecTy.getVectorElementType() == MVT::f32)) { assert((ElemSize == 1 || ElemSize == 2) && "Invalid element size"); unsigned OpsPerWord = (ElemSize == 1) ? 4 : 2; MVT PartVT = MVT::getVectorVT(VecTy.getVectorElementType(), OpsPerWord); @@ -506,22 +535,31 @@ Words.push_back(DAG.getBitcast(MVT::i32, W)); } } else { - Words.assign(Values.begin(), Values.end()); + for (SDValue V : Values) + Words.push_back(DAG.getBitcast(MVT::i32, V)); } + auto isSplat = [] (ArrayRef<SDValue> Values, SDValue &SplatV) { + unsigned NumValues = Values.size(); + assert(NumValues > 0); + bool IsUndef = true; + for (unsigned i = 0; i != NumValues; ++i) { + if (Values[i].isUndef()) + continue; + IsUndef = false; + if (!SplatV.getNode()) + SplatV = Values[i]; + else if (SplatV != Values[i]) + return false; + } + if (IsUndef) + SplatV = Values[0]; + return true; + }; unsigned NumWords = Words.size(); - bool IsSplat = true, IsUndef = true; SDValue SplatV; - for (unsigned i = 0; i != NumWords && IsSplat; ++i) { - if (isUndef(Words[i])) - continue; - IsUndef = false; - if (!SplatV.getNode()) - SplatV = Words[i]; - else if (SplatV != Words[i]) - IsSplat = false; - } - if (IsUndef) + bool IsSplat = isSplat(Words, SplatV); + if (IsSplat && isUndef(SplatV)) return DAG.getUNDEF(VecTy); if (IsSplat) { assert(SplatV.getNode()); @@ -618,24 +656,44 @@ } } - // Construct two halves in parallel, then or them together. + // Find most common element to initialize vector with. This is to avoid + // unnecessary vinsert/valign for cases where the same value is present + // many times. Creates a histogram of the vector's elements to find the + // most common element. assert(4*Words.size() == Subtarget.getVectorLength()); - SDValue HalfV0 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG); - SDValue HalfV1 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG); - SDValue S = DAG.getConstant(4, dl, MVT::i32); - for (unsigned i = 0; i != NumWords/2; ++i) { - SDValue N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, - {HalfV0, Words[i]}); - SDValue M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, - {HalfV1, Words[i+NumWords/2]}); - HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, S}); - HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, S}); + SmallVector<unsigned> VecHist(32); + int MaxAt = 0; + for (unsigned i = 0; i != NumWords; ++i) { + VecHist[i] = 0; + if (Words[i].isUndef()) + continue; + for (unsigned j = i; j != NumWords; ++j) + if (Words[i] == Words[j]) + VecHist[i]++; + + if (VecHist[i] > VecHist[MaxAt]) + MaxAt = i; } - HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, - {HalfV0, DAG.getConstant(HwLen/2, dl, MVT::i32)}); - SDValue DstV = DAG.getNode(ISD::OR, dl, VecTy, {HalfV0, HalfV1}); - return DstV; + // If each value is different, don't do splat, just insert them one by one. + bool NoSplat = VecHist[MaxAt] <= 1; + SDValue RotV = NoSplat + ? DAG.getUNDEF(VecTy) + : DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Words[MaxAt]); + int Rn = 0; + for (unsigned i = 0; i != NumWords; ++i) { + // Rotate by element count since last insertion.
+ if (NoSplat || Words[i] != Words[MaxAt]) { + RotV = DAG.getNode(HexagonISD::VROR, dl, VecTy, + {RotV, DAG.getConstant(Rn, dl, MVT::i32)}); + RotV = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, {RotV, Words[i]}); + Rn = 0; + } + Rn += 4; + } + // Perform last rotation. + return DAG.getNode(HexagonISD::VROR, dl, VecTy, + {RotV, DAG.getConstant(Rn, dl, MVT::i32)}); } SDValue @@ -1237,6 +1295,19 @@ if (VecTy.getVectorElementType() == MVT::i1) return buildHvxVectorPred(Ops, dl, VecTy, DAG); + // In case of MVT::f16 BUILD_VECTOR, since MVT::f16 is + // not a legal type, just bitcast the node to use i16 + // types and bitcast the result back to f16. + if (VecTy.getVectorElementType() == MVT::f16) { + SmallVector<SDValue> NewOps; + for (unsigned i = 0; i != Size; i++) + NewOps.push_back(DAG.getBitcast(MVT::i16, Ops[i])); + + SDValue T0 = DAG.getNode(ISD::BUILD_VECTOR, dl, + tyVector(VecTy, MVT::i16), NewOps); + return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0); + } + if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) { ArrayRef<SDValue> A(Ops); MVT SingleTy = typeSplit(VecTy).first; @@ -1248,6 +1319,24 @@ return buildHvxVectorReg(Ops, dl, VecTy, DAG); } +SDValue +HexagonTargetLowering::LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) + const { + const SDLoc &dl(Op); + MVT VecTy = ty(Op); + MVT ArgTy = ty(Op.getOperand(0)); + + if (ArgTy == MVT::f16) { + MVT SplatTy = MVT::getVectorVT(MVT::i16, VecTy.getVectorNumElements()); + SDValue ToInt16 = DAG.getBitcast(MVT::i16, Op.getOperand(0)); + SDValue ToInt32 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, ToInt16); + SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, dl, SplatTy, ToInt32); + return DAG.getBitcast(VecTy, Splat); + } + + return SDValue(); +} + SDValue HexagonTargetLowering::LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const { @@ -2134,6 +2223,7 @@ default: break; case ISD::BUILD_VECTOR: return LowerHvxBuildVector(Op, DAG); + case ISD::SPLAT_VECTOR: return LowerHvxSplatVector(Op, DAG); case ISD::CONCAT_VECTORS: return LowerHvxConcatVectors(Op, DAG); case ISD::INSERT_SUBVECTOR: return LowerHvxInsertSubvector(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerHvxInsertElement(Op, DAG); diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -87,18 +87,6 @@ def V4I16: PatLeaf<(v4i16 DoubleRegs:$R)>; def V2I32: PatLeaf<(v2i32 DoubleRegs:$R)>; -def HQ8: PatLeaf<(VecQ8 HvxQR:$R)>; -def HQ16: PatLeaf<(VecQ16 HvxQR:$R)>; -def HQ32: PatLeaf<(VecQ32 HvxQR:$R)>; - -def HVI8: PatLeaf<(VecI8 HvxVR:$R)>; -def HVI16: PatLeaf<(VecI16 HvxVR:$R)>; -def HVI32: PatLeaf<(VecI32 HvxVR:$R)>; - -def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>; -def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>; -def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>; - def SDTVecLeaf: SDTypeProfile<1, 0, [SDTCisVec<0>]>; def SDTVecVecIntOp: @@ -269,6 +257,9 @@ def f32ImmPred : PatLeaf<(f32 fpimm:$F)>; def f64ImmPred : PatLeaf<(f64 fpimm:$F)>; +def f32zero: PatLeaf<(f32 fpimm:$F), [{ + return N->isExactlyValue(APFloat::getZero(APFloat::IEEEsingle(), false)); +}]>; // This complex pattern is really only to detect various forms of
The selected value will be of type i64 diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -6,6 +6,21 @@ // //===----------------------------------------------------------------------===// +def HQ8: PatLeaf<(VecQ8 HvxQR:$R)>; +def HQ16: PatLeaf<(VecQ16 HvxQR:$R)>; +def HQ32: PatLeaf<(VecQ32 HvxQR:$R)>; + +def HVI8: PatLeaf<(VecI8 HvxVR:$R)>; +def HVI16: PatLeaf<(VecI16 HvxVR:$R)>; +def HVI32: PatLeaf<(VecI32 HvxVR:$R)>; +def HVF16: PatLeaf<(VecF16 HvxVR:$R)>; +def HVF32: PatLeaf<(VecF32 HvxVR:$R)>; + +def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>; +def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>; +def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>; +def HWF16: PatLeaf<(VecPF16 HvxWR:$R)>; +def HWF32: PatLeaf<(VecPF32 HvxWR:$R)>; def SDTVecUnaryOp: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; @@ -49,7 +64,7 @@ def HexagonVUNPACK: SDNode<"HexagonISD::VUNPACK", SDTVecUnaryOp>; def HexagonVUNPACKU: SDNode<"HexagonISD::VUNPACKU", SDTVecUnaryOp>; -def vzero: PatFrag<(ops), (splat_vector (i32 0))>; +def vzero: PatFrags<(ops), [(splat_vector (i32 0)), (splat_vector (f32zero))]>; def qtrue: PatFrag<(ops), (HexagonQTRUE)>; def qfalse: PatFrag<(ops), (HexagonQFALSE)>; def qcat: PatFrag<(ops node:$Qs, node:$Qt), @@ -150,12 +165,19 @@ defm: HvxLda_pat; defm: HvxLda_pat; defm: HvxLda_pat; - defm: HvxLd_pat; defm: HvxLd_pat; defm: HvxLd_pat; } +let Predicates = [UseHVXV68] in { + defm: HvxLda_pat; + defm: HvxLda_pat; + defm: HvxLda_pat; + defm: HvxLda_pat; + defm: HvxLd_pat; + defm: HvxLd_pat; +} // HVX stores @@ -199,6 +221,15 @@ defm: HvxSt_pat; } +let Predicates = [UseHVXV68] in { + defm: HvxSt_pat; + defm: HvxSt_pat; + defm: HvxSt_pat; + defm: HvxSt_pat; + defm: HvxSt_pat; + defm: HvxSt_pat; +} + // Bitcasts between same-size vector types are no-ops, except for the // actual type change. let Predicates = [UseHVX] in { @@ -211,6 +242,24 @@ defm: NopCast_pat; } +let Predicates = [UseHVX, UseHVXFloatingPoint] in { + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; +} + let Predicates = [UseHVX] in { let AddedComplexity = 100 in { // These should be preferred over a vsplat of 0. 
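Returning to buildHvxVectorReg above: the histogram pass counts, for each word, how many later words carry the same value, keeps the index of the most frequent one, splats it, and then only patches the disagreeing lanes with rotate+insert. A stand-alone sketch of just the selection step (plain C++, toy element type; assumes a non-empty input, as the lowering does):

#include <cstddef>
#include <vector>

// Returns the index of the most frequent value in Words. This is the same
// O(n^2) scan the lowering above uses, which is fine for the at-most-32
// words of an HVX vector.
static std::size_t mostCommonWord(const std::vector<int> &Words) {
  std::vector<unsigned> VecHist(Words.size(), 0);
  std::size_t MaxAt = 0;
  for (std::size_t i = 0; i != Words.size(); ++i) {
    for (std::size_t j = i; j != Words.size(); ++j)
      if (Words[i] == Words[j])
        ++VecHist[i];
    if (VecHist[i] > VecHist[MaxAt])
      MaxAt = i;
  }
  return MaxAt; // splat Words[MaxAt], then insert the exceptions lane by lane
}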
@@ -220,6 +269,7 @@ def: Pat<(VecPI8 vzero), (PS_vdd0)>; def: Pat<(VecPI16 vzero), (PS_vdd0)>; def: Pat<(VecPI32 vzero), (PS_vdd0)>; + def: Pat<(VecPF32 vzero), (PS_vdd0)>; def: Pat<(concat_vectors (VecI8 vzero), (VecI8 vzero)), (PS_vdd0)>; def: Pat<(concat_vectors (VecI16 vzero), (VecI16 vzero)), (PS_vdd0)>; @@ -251,6 +301,28 @@ (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>; } +let Predicates = [UseHVX, UseHVXFloatingPoint] in { + let AddedComplexity = 100 in { + def: Pat<(VecF16 vzero), (V6_vd0)>; + def: Pat<(VecF32 vzero), (V6_vd0)>; + def: Pat<(VecPF16 vzero), (PS_vdd0)>; + def: Pat<(VecPF32 vzero), (PS_vdd0)>; + + def: Pat<(concat_vectors (VecF16 vzero), (VecF16 vzero)), (PS_vdd0)>; + def: Pat<(concat_vectors (VecF32 vzero), (VecF32 vzero)), (PS_vdd0)>; + } + + def: Pat<(VecPF16 (concat_vectors HVF16:$Vs, HVF16:$Vt)), + (Combinev HvxVR:$Vt, HvxVR:$Vs)>; + def: Pat<(VecPF32 (concat_vectors HVF32:$Vs, HVF32:$Vt)), + (Combinev HvxVR:$Vt, HvxVR:$Vs)>; + + def: Pat<(HexagonVINSERTW0 HVF16:$Vu, I32:$Rt), + (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>; + def: Pat<(HexagonVINSERTW0 HVF32:$Vu, I32:$Rt), + (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>; +} + // Splats for HvxV60 def V60splatib: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatB $V)))>; def V60splatih: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatH $V)))>; @@ -307,6 +379,18 @@ def: Pat<(VecPI32 (splat_vector I32:$Rs)), (Rep (V62splatrw $Rs))>; } } +let Predicates = [UseHVXV68, UseHVXFloatingPoint] in { + let AddedComplexity = 30 in { + def: Pat<(VecF16 (splat_vector u16_0ImmPred:$V)), (V62splatih imm:$V)>; + def: Pat<(VecF32 (splat_vector anyint:$V)), (V62splatiw imm:$V)>; + def: Pat<(VecF32 (splat_vector f32ImmPred:$V)), (V62splatiw (ftoi $V))>; + } + let AddedComplexity = 20 in { + def: Pat<(VecF16 (splat_vector I32:$Rs)), (V62splatrh $Rs)>; + def: Pat<(VecF32 (splat_vector I32:$Rs)), (V62splatrw $Rs)>; + def: Pat<(VecF32 (splat_vector F32:$Rs)), (V62splatrw $Rs)>; + } +} class Vneg1 : PatFrag<(ops), (VecTy (splat_vector (i32 -1)))>; diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -479,6 +479,10 @@ [v32i16, v64i16, v32i16]>; def VecI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v16i32, v32i32, v16i32]>; +def VecF16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], + [v32f16, v64f16, v32f16]>; +def VecF32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], + [v16f32, v32f32, v16f32]>; def VecPI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v128i8, v256i8, v128i8]>; @@ -486,6 +490,10 @@ [v64i16, v128i16, v64i16]>; def VecPI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v32i32, v64i32, v32i32]>; +def VecPF16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], + [v64f16, v128f16, v64f16]>; +def VecPF32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], + [v32f32, v64f32, v32f32]>; def VecQ8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v64i1, v128i1, v64i1]>; @@ -496,13 +504,13 @@ // HVX register classes -def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32], 512, +def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32, VecF16, VecF32], 512, (add (sequence "V%u", 0, 31), VTMP)> { let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], [RegInfo<512,512,512>, RegInfo<1024,1024,1024>, RegInfo<512,512,512>]>; } -def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32], 1024, +def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32, VecPF16, 
VecPF32], 1024, (add (sequence "W%u", 0, 15), (sequence "WR%u", 0, 15))> { let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], [RegInfo<1024,1024,1024>, RegInfo<2048,2048,2048>, RegInfo<1024,1024,1024>]>; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -81,6 +81,10 @@ void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue); bool registerUsed(unsigned Register); + + /// \return a tuple of: pointer to the producer instruction or nullptr if + /// none was found, the operand index, and the PredicateInfo for the + /// producer. std::tuple registerProducer(unsigned Register, HexagonMCInstrInfo::PredicateInfo Predicated); @@ -103,11 +107,6 @@ static void compoundRegisterMap(unsigned &); - bool isPredicateRegister(unsigned R) const { - return (Hexagon::P0 == R || Hexagon::P1 == R || Hexagon::P2 == R || - Hexagon::P3 == R); - } - bool isLoopRegister(unsigned R) const { return (Hexagon::SA0 == R || Hexagon::LC0 == R || Hexagon::SA1 == R || Hexagon::LC1 == R); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -16,6 +16,7 @@ #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCShuffler.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" + #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" @@ -65,7 +66,8 @@ void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg, bool &isTrue) { - if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) { + if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && + HexagonMCInstrInfo::isPredReg(RI, R)) { // Note an used predicate register. PredReg = R; isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI); @@ -123,7 +125,7 @@ // same packet with an instruction that modifies is explicitly. Deal // with such situations individually. SoftDefs.insert(R); - else if (isPredicateRegister(R) && + else if (HexagonMCInstrInfo::isPredReg(RI, R) && HexagonMCInstrInfo::isPredicateLate(MCII, MCI)) // Include implicit late predicates. LatePreds.insert(R); @@ -167,7 +169,7 @@ // side-effect, then note as a soft definition. SoftDefs.insert(*SRI); else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) && - isPredicateRegister(*SRI)) + HexagonMCInstrInfo::isPredReg(RI, *SRI)) // Some insns produce predicates too late to be used in the same packet. 
LatePreds.insert(*SRI); else if (i == 0 && HexagonMCInstrInfo::getType(MCII, MCI) == @@ -193,7 +195,7 @@ if (MCI.getOperand(i).isReg()) { unsigned P = MCI.getOperand(i).getReg(); - if (isPredicateRegister(P)) + if (HexagonMCInstrInfo::isPredReg(RI, P)) NewPreds.insert(P); } } @@ -315,8 +317,7 @@ void HexagonMCChecker::reportBranchErrors() { for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { - MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); - if (Desc.isBranch() || Desc.isCall() || Desc.isReturn()) + if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, I)) reportNote(I.getLoc(), "Branching instruction"); } } @@ -326,8 +327,7 @@ !HexagonMCInstrInfo::isOuterLoop(MCB)) return true; for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { - MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); - if (Desc.isBranch() || Desc.isCall() || Desc.isReturn()) { + if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, I)) { reportError(MCB.getLoc(), "Branches cannot be in a packet with hardware loops"); reportBranchErrors(); @@ -340,8 +340,7 @@ bool HexagonMCChecker::checkCOFMax1() { SmallVector BranchLocations; for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { - MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); - if (Desc.isBranch() || Desc.isCall() || Desc.isReturn()) + if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, I)) BranchLocations.push_back(&I); } for (unsigned J = 0, N = BranchLocations.size(); J < N; ++J) { @@ -424,81 +423,109 @@ // Check legal use of new values. bool HexagonMCChecker::checkNewValues() { - for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { - if (!HexagonMCInstrInfo::isNewValue(MCII, I)) + for (auto const &ConsumerInst : + HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { + if (!HexagonMCInstrInfo::isNewValue(MCII, ConsumerInst)) continue; - auto Consumer = HexagonMCInstrInfo::predicateInfo(MCII, I); - bool Branch = HexagonMCInstrInfo::getDesc(MCII, I).isBranch(); - MCOperand const &Op = HexagonMCInstrInfo::getNewValueOperand(MCII, I); + + const HexagonMCInstrInfo::PredicateInfo ConsumerPredInfo = + HexagonMCInstrInfo::predicateInfo(MCII, ConsumerInst); + + bool Branch = HexagonMCInstrInfo::getDesc(MCII, ConsumerInst).isBranch(); + MCOperand const &Op = + HexagonMCInstrInfo::getNewValueOperand(MCII, ConsumerInst); assert(Op.isReg()); - auto Producer = registerProducer(Op.getReg(), Consumer); - if (std::get<0>(Producer) == nullptr) { - reportError(I.getLoc(), "New value register consumer has no producer"); + + auto Producer = registerProducer(Op.getReg(), ConsumerPredInfo); + const MCInst *const ProducerInst = std::get<0>(Producer); + const HexagonMCInstrInfo::PredicateInfo ProducerPredInfo = + std::get<2>(Producer); + + if (ProducerInst == nullptr) { + reportError(ConsumerInst.getLoc(), + "New value register consumer has no producer"); return false; } if (!RelaxNVChecks) { // Checks that statically prove correct new value consumption - if (std::get<2>(Producer).isPredicated() && - (!Consumer.isPredicated() || - llvm::HexagonMCInstrInfo::getType(MCII, I) == HexagonII::TypeNCJ)) { + if (ProducerPredInfo.isPredicated() && + (!ConsumerPredInfo.isPredicated() || + llvm::HexagonMCInstrInfo::getType(MCII, ConsumerInst) == + HexagonII::TypeNCJ)) { reportNote( - std::get<0>(Producer)->getLoc(), + ProducerInst->getLoc(), "Register producer is predicated and consumer is unconditional"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction 
does not have a valid new register producer"); return false; } - if (std::get<2>(Producer).Register != Hexagon::NoRegister && - std::get<2>(Producer).Register != Consumer.Register) { - reportNote(std::get<0>(Producer)->getLoc(), + if (ProducerPredInfo.Register != Hexagon::NoRegister && + ProducerPredInfo.Register != ConsumerPredInfo.Register) { + reportNote(ProducerInst->getLoc(), "Register producer does not use the same predicate " "register as the consumer"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } } - if (std::get<2>(Producer).Register == Consumer.Register && - Consumer.PredicatedTrue != std::get<2>(Producer).PredicatedTrue) { + if (ProducerPredInfo.Register == ConsumerPredInfo.Register && + ConsumerPredInfo.PredicatedTrue != ProducerPredInfo.PredicatedTrue) { reportNote( - std::get<0>(Producer)->getLoc(), + ProducerInst->getLoc(), "Register producer has the opposite predicate sense as consumer"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } - MCInstrDesc const &Desc = - HexagonMCInstrInfo::getDesc(MCII, *std::get<0>(Producer)); - if (Desc.OpInfo[std::get<1>(Producer)].RegClass == + + MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, *ProducerInst); + const unsigned ProducerOpIndex = std::get<1>(Producer); + + if (Desc.OpInfo[ProducerOpIndex].RegClass == Hexagon::DoubleRegsRegClassID) { - reportNote(std::get<0>(Producer)->getLoc(), + reportNote(ProducerInst->getLoc(), "Double registers cannot be new-value producers"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } - if ((Desc.mayLoad() && std::get<1>(Producer) == 1) || - (Desc.mayStore() && std::get<1>(Producer) == 0)) { - unsigned Mode = - HexagonMCInstrInfo::getAddrMode(MCII, *std::get<0>(Producer)); + + // The ProducerOpIsMemIndex logic checks for the index of the producer + // register operand. Z-reg load instructions have an implicit operand + // that's not encoded, so the producer won't appear as the 1-th def, it + // will be at the 0-th. + const unsigned ProducerOpSearchIndex = + (HexagonMCInstrInfo::getType(MCII, *ProducerInst) == + HexagonII::TypeCVI_ZW) + ? 
0 + : 1; + + const bool ProducerOpIsMemIndex = + ((Desc.mayLoad() && ProducerOpIndex == ProducerOpSearchIndex) || + (Desc.mayStore() && ProducerOpIndex == 0)); + + if (ProducerOpIsMemIndex) { + unsigned Mode = HexagonMCInstrInfo::getAddrMode(MCII, *ProducerInst); + StringRef ModeError; if (Mode == HexagonII::AbsoluteSet) ModeError = "Absolute-set"; if (Mode == HexagonII::PostInc) ModeError = "Auto-increment"; if (!ModeError.empty()) { - reportNote(std::get<0>(Producer)->getLoc(), + reportNote(ProducerInst->getLoc(), ModeError + " registers cannot be a new-value " "producer"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } } - if (Branch && HexagonMCInstrInfo::isFloat(MCII, *std::get<0>(Producer))) { - reportNote(std::get<0>(Producer)->getLoc(), + if (Branch && HexagonMCInstrInfo::isFloat(MCII, *ProducerInst)) { + reportNote(ProducerInst->getLoc(), "FPU instructions cannot be new-value producers for jumps"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } @@ -541,9 +568,11 @@ unsigned Register, HexagonMCInstrInfo::PredicateInfo ConsumerPredicate) { std::tuple WrongSense; + for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); auto ProducerPredicate = HexagonMCInstrInfo::predicateInfo(MCII, I); + for (unsigned J = 0, N = Desc.getNumDefs(); J < N; ++J) for (auto K = MCRegAliasIterator(I.getOperand(J).getReg(), &RI, true); K.isValid(); ++K) @@ -599,7 +628,7 @@ reportErrorRegisters(BadR); return false; } - if (!isPredicateRegister(R) && Defs[R].size() > 1) { + if (!HexagonMCInstrInfo::isPredReg(RI, R) && Defs[R].size() > 1) { // Check for multiple register definitions. 
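The predicate checks in checkNewValues above reduce to three pairwise conditions on the producer and the consumer. A minimal sketch of that decision, with a simplified stand-in for HexagonMCInstrInfo::PredicateInfo (only the fields the checker consults are modeled; the struct and helper names are illustrative, not the checker's API):

// Simplified stand-in for HexagonMCInstrInfo::PredicateInfo; 0 plays the
// role of Hexagon::NoRegister.
struct PredInfo {
  unsigned Register = 0;
  bool PredicatedTrue = false;
  bool isPredicated() const { return Register != 0; }
};

// Mirrors the accept/reject logic of HexagonMCChecker::checkNewValues for
// one producer/consumer pair (error reporting omitted).
bool newValueUseIsValid(const PredInfo &Producer, const PredInfo &Consumer,
                        bool RelaxNVChecks, bool ConsumerIsNCJ) {
  if (!RelaxNVChecks) {
    // A predicated producer may not feed an unconditional (or NCJ) consumer.
    if (Producer.isPredicated() && (!Consumer.isPredicated() || ConsumerIsNCJ))
      return false;
    // Producer and consumer must be guarded by the same predicate register.
    if (Producer.Register != 0 && Producer.Register != Consumer.Register)
      return false;
  }
  // Even relaxed checking rejects opposite senses of the same predicate.
  if (Producer.Register == Consumer.Register &&
      Producer.PredicatedTrue != Consumer.PredicatedTrue)
    return false;
  return true;
}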
PredSet &PM = Defs[R]; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -95,6 +95,8 @@ MCContext &Context, MCInst &MCB, HexagonMCChecker *Checker, bool AttemptCompatibility = false); +bool IsABranchingInst(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, + MCInst const &I); // Create a duplex instruction given the two subinsts MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -1030,3 +1030,11 @@ return Consumer == Producer; return 0; } + +bool HexagonMCInstrInfo::IsABranchingInst(MCInstrInfo const &MCII, + MCSubtargetInfo const &STI, + MCInst const &I) { + assert(!HexagonMCInstrInfo::isBundle(I)); + MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); + return (Desc.isBranch() || Desc.isCall() || Desc.isReturn()); +} diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp @@ -148,7 +148,7 @@ void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS, const char *Modifier) { - assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); + assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported"); const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) OS << "%" << getRegisterName(Op.getReg()); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp @@ -40,9 +40,8 @@ std::unique_ptr MAB, std::unique_ptr OW, std::unique_ptr Emitter) - : MCELFStreamer(Context, std::move(MAB), std::move(OW), - std::move(Emitter)), LastLabel(NULL) { -} + : MCELFStreamer(Context, std::move(MAB), std::move(OW), std::move(Emitter)), + LastLabel(nullptr) {} void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -5266,7 +5266,7 @@ if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); - if (SrcMI != NULL) + if (SrcMI != nullptr) return isSignOrZeroExtended(*SrcMI, SignExt, Depth); return false; @@ -5290,7 +5290,7 @@ if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); - if (SrcMI != NULL) + if (SrcMI != nullptr) return isSignOrZeroExtended(*SrcMI, SignExt, Depth); return false; @@ -5319,7 +5319,8 @@ if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); - if (SrcMI == NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1)) + if (SrcMI == nullptr || + !isSignOrZeroExtended(*SrcMI, SignExt, Depth + 1)) return false; } else diff --git 
a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -85,7 +85,7 @@ void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, const char *Modifier) { - assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); + assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported"); const MCOperand &MO = MI->getOperand(OpNo); if (MO.isReg()) { diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1727,6 +1727,20 @@ if (Bits < Subtarget->getXLen() - User->getConstantOperandVal(1)) return false; break; + case RISCV::ANDI: + if (Bits < (64 - countLeadingZeros(User->getConstantOperandVal(1)))) + return false; + break; + case RISCV::SEXTB: + if (Bits < 8) + return false; + break; + case RISCV::SEXTH: + case RISCV::ZEXTH_RV32: + case RISCV::ZEXTH_RV64: + if (Bits < 16) + return false; + break; case RISCV::ADDUW: case RISCV::SH1ADDUW: case RISCV::SH2ADDUW: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -8393,7 +8393,8 @@ LocVT = XLenVT; LocInfo = CCValAssign::Indirect; } else if (ValVT.isScalableVector()) { - report_fatal_error("Unable to pass scalable vector types on the stack"); + LocVT = XLenVT; + LocInfo = CCValAssign::Indirect; } else { // Pass fixed-length vectors on the stack. LocVT = ValVT; @@ -8592,6 +8593,12 @@ EVT LocVT = VA.getLocVT(); EVT ValVT = VA.getValVT(); EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0)); + if (ValVT.isScalableVector()) { + // When the value is a scalable vector, we save the pointer which points to + // the scalable vector value in the stack. The ValVT will be the pointer + // type, instead of the scalable vector type. + ValVT = LocVT; + } int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), VA.getLocMemOffset(), /*Immutable=*/true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -201,8 +201,9 @@ if (MBBI->modifiesRegister(RISCV::VL)) return false; - // Go through all defined operands, including implicit defines. - for (const MachineOperand &MO : MBBI->operands()) { + // Only converting whole register copies to vmv.v.v when the defining + // value appears in the explicit operands. + for (const MachineOperand &MO : MBBI->explicit_operands()) { if (!MO.isReg() || !MO.isDef()) continue; if (!FoundDef && TRI->isSubRegisterEq(MO.getReg(), SrcReg)) { diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -10537,13 +10537,12 @@ multiclass cvt_mask_by_elt_width opc, AVX512VLVectorVTInfo VTInfo, string OpcodeStr, Predicate prd> { -// TODO - Replace WriteMove with WriteVecTrunc? 
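Among the cases added to RISCVISelDAGToDAG a little earlier, the ANDI one is the only non-obvious bound: an AND with immediate C can only observe the low 64 - countLeadingZeros(C) bits of its input, since everything above the mask's highest set bit is cleared anyway. A standalone restatement of that arithmetic, with std::countl_zero standing in for LLVM's countLeadingZeros:

#include <bit>
#include <cstdint>
#include <cstdio>

// Number of low bits an ANDI with mask C can observe: bits above the
// highest set bit of C never reach the result.
unsigned bitsObservedByANDI(uint64_t C) {
  return 64 - std::countl_zero(C);
}

int main() {
  printf("%u\n", bitsObservedByANDI(0xff));  // 8: only the low byte survives
  printf("%u\n", bitsObservedByANDI(0xfff)); // 12
  printf("%u\n", bitsObservedByANDI(1));     // 1
}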
let Predicates = [prd] in - defm Z : cvt_by_vec_width, EVEX_V512; + defm Z : cvt_by_vec_width, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : cvt_by_vec_width, EVEX_V256; - defm Z128 : cvt_by_vec_width, EVEX_V128; + defm Z256 : cvt_by_vec_width, EVEX_V256; + defm Z128 : cvt_by_vec_width, EVEX_V128; } } diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -255,6 +255,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : BWWriteResPair; // Floating point add/sub. @@ -418,6 +419,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -257,6 +257,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : HWWriteResPair; @@ -416,6 +417,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -252,6 +252,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : ICXWriteResPair; // Floating point add/sub. @@ -367,6 +368,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -223,6 +223,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : SBWriteResPair; @@ -380,6 +381,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -244,6 +244,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : SKLWriteResPair; // Floating point add/sub. @@ -359,6 +360,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -244,6 +244,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : SKXWriteResPair; // Floating point add/sub. 
@@ -359,6 +360,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -239,6 +239,7 @@ def WriteFMove : SchedWrite; def WriteFMoveX : SchedWrite; def WriteFMoveY : SchedWrite; +def WriteFMoveZ : SchedWrite; defm WriteFAdd : X86SchedWritePair; // Floating point add/sub. defm WriteFAddX : X86SchedWritePair; // Floating point add/sub (XMM). @@ -354,6 +355,7 @@ def WriteVecMove : SchedWrite; def WriteVecMoveX : SchedWrite; def WriteVecMoveY : SchedWrite; +def WriteVecMoveZ : SchedWrite; def WriteVecMoveToGpr : SchedWrite; def WriteVecMoveFromGpr : SchedWrite; @@ -516,9 +518,11 @@ : X86SchedWriteMoveLS; def WriteFMoveLSY : X86SchedWriteMoveLS; +def WriteFMoveLSZ + : X86SchedWriteMoveLS; def SchedWriteFMoveLS : X86SchedWriteMoveLSWidths; + WriteFMoveLSY, WriteFMoveLSZ>; def WriteFMoveLSNT : X86SchedWriteMoveLS; @@ -536,9 +540,11 @@ : X86SchedWriteMoveLS; def WriteVecMoveLSY : X86SchedWriteMoveLS; +def WriteVecMoveLSZ + : X86SchedWriteMoveLS; def SchedWriteVecMoveLS : X86SchedWriteMoveLSWidths; + WriteVecMoveLSY, WriteVecMoveLSZ>; def WriteVecMoveLSNT : X86SchedWriteMoveLS; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -229,6 +229,7 @@ def : WriteRes; def : WriteRes; defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; defm : X86WriteRes; @@ -382,6 +383,7 @@ def : WriteRes; def : WriteRes; defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td --- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td @@ -772,6 +772,7 @@ defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; +defm : X86WriteResUnsupported; defm : PdWriteRes; @@ -1107,6 +1108,7 @@ defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; +defm : X86WriteResUnsupported; def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> { } diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -525,6 +525,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; @@ -682,6 +683,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -200,6 +200,7 @@ def : WriteRes; def : WriteRes; def : WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : SLMWriteResPair; @@ -345,6 +346,7 @@ def : WriteRes; def : WriteRes; def : WriteRes; +defm : X86WriteResUnsupported; def : WriteRes; def : WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -286,6 +286,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : ZnWriteResFpuPair; defm : 
ZnWriteResFpuPair; @@ -404,6 +405,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -274,6 +274,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; @@ -388,6 +389,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteResUnsupported; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -1446,10 +1446,12 @@ defm : Zn3WriteResXMM; // Empty sched class defm : Zn3WriteResXMM; defm : Zn3WriteResYMM; +defm : X86WriteResUnsupported; defm : Zn3WriteResXMM; // MMX defm : Zn3WriteResXMM; defm : Zn3WriteResYMM; +defm : X86WriteResUnsupported; def : IsOptimizableRegisterMove<[ InstructionEquivalenceClass<[ diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -43,6 +43,7 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" @@ -5187,9 +5188,48 @@ const FeatureBitset &CalleeBits = TM.getSubtargetImpl(*Callee)->getFeatureBits(); + // Check whether features are the same (apart from the ignore list). FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; - return (RealCallerBits & RealCalleeBits) == RealCalleeBits; + if (RealCallerBits == RealCalleeBits) + return true; + + // If the features are a subset, we need to additionally check for calls + // that may become ABI-incompatible as a result of inlining. + if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) + return false; + + for (const Instruction &I : instructions(Callee)) { + if (const auto *CB = dyn_cast(&I)) { + SmallVector Types; + for (Value *Arg : CB->args()) + Types.push_back(Arg->getType()); + if (!CB->getType()->isVoidTy()) + Types.push_back(CB->getType()); + + // Simple types are always ABI compatible. + auto IsSimpleTy = [](Type *Ty) { + return !Ty->isVectorTy() && !Ty->isAggregateType(); + }; + if (all_of(Types, IsSimpleTy)) + continue; + + if (Function *NestedCallee = CB->getCalledFunction()) { + // Assume that intrinsics are always ABI compatible. + if (NestedCallee->isIntrinsic()) + continue; + + // Do a precise compatibility check. + if (!areTypesABICompatible(Caller, NestedCallee, Types)) + return false; + } else { + // We don't know the target features of the callee, + // assume it is incompatible. 
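The per-call scan in areInlineCompatible above only runs when the callee's features are a strict subset of the caller's, and even then most calls are filtered out cheaply: scalar types are passed the same way regardless of target features, so only vector and aggregate types can become ABI-incompatible once the call is inlined into a differently-featured caller (e.g. a 512-bit vector argument with and without AVX-512). The filter, restated as a standalone helper against LLVM's headers (the helper name is illustrative):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// True if every type is scalar, i.e. its passing convention cannot depend
// on vector-related subtarget features; only the remaining calls need the
// precise areTypesABICompatible check.
static bool allTypesAreABINeutral(ArrayRef<Type *> Types) {
  auto IsSimpleTy = [](Type *Ty) {
    return !Ty->isVectorTy() && !Ty->isAggregateType();
  };
  return all_of(Types, IsSimpleTy);
}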
+ return false; + } + } + } + return true; } bool X86TTIImpl::areTypesABICompatible(const Function *Caller, diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ValueTracking.h" @@ -202,9 +203,17 @@ return NoRecurseAA.isAssumedNoRecurse(); } -Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty) { +Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty, + const TargetLibraryInfo *TLI) { if (isa(Obj)) return UndefValue::get(&Ty); + if (isNoAliasFn(&Obj, TLI)) { + if (isMallocLikeFn(&Obj, TLI) || isAlignedAllocLikeFn(&Obj, TLI)) + return UndefValue::get(&Ty); + if (isCallocLikeFn(&Obj, TLI)) + return Constant::getNullValue(&Ty); + return nullptr; + } auto *GV = dyn_cast(&Obj); if (!GV || !GV->hasLocalLinkage()) return nullptr; @@ -300,6 +309,8 @@ SmallVector PIs; SmallVector NewCopies; + const auto *TLI = + A.getInfoCache().getTargetLibraryInfoForFunction(*SI.getFunction()); for (Value *Obj : Objects) { LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n"); if (isa(Obj)) @@ -316,7 +327,8 @@ dbgs() << "Underlying object is a valid nullptr, giving up.\n";); return false; } - if (!isa(Obj) && !isa(Obj)) { + if (!isa(Obj) && !isa(Obj) && + !isNoAliasFn(Obj, TLI)) { LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << *Obj << "\n";); return false; @@ -999,10 +1011,11 @@ return false; } -bool Attributor::checkForAllUses(function_ref Pred, - const AbstractAttribute &QueryingAA, - const Value &V, bool CheckBBLivenessOnly, - DepClassTy LivenessDepClass) { +bool Attributor::checkForAllUses( + function_ref Pred, + const AbstractAttribute &QueryingAA, const Value &V, + bool CheckBBLivenessOnly, DepClassTy LivenessDepClass, + function_ref EquivalentUseCB) { // Check the trivial case first as it catches void values. if (V.use_empty()) @@ -1053,8 +1066,15 @@ << PotentialCopies.size() << " potential copies instead!\n"); for (Value *PotentialCopy : PotentialCopies) - for (const Use &U : PotentialCopy->uses()) - Worklist.push_back(&U); + for (const Use &CopyUse : PotentialCopy->uses()) { + if (EquivalentUseCB && !EquivalentUseCB(*U, CopyUse)) { + LLVM_DEBUG(dbgs() << "[Attributor] Potential copy was " + "rejected by the equivalence call back: " + << *CopyUse << "!\n"); + return false; + } + Worklist.push_back(&CopyUse); + } continue; } } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -1161,6 +1161,10 @@ return true; }; + const auto *TLI = getAnchorScope() + ? 
A.getInfoCache().getTargetLibraryInfoForFunction( + *getAnchorScope()) + : nullptr; auto UsePred = [&](const Use &U, bool &Follow) -> bool { Value *CurPtr = U.get(); User *Usr = U.getUser(); @@ -1275,6 +1279,8 @@ if (auto *CB = dyn_cast(Usr)) { if (CB->isLifetimeStartOrEnd()) return true; + if (TLI && isFreeCall(CB, TLI)) + return true; if (CB->isArgOperand(&U)) { unsigned ArgNo = CB->getArgOperandNo(&U); const auto &CSArgPI = A.getAAFor( @@ -1293,8 +1299,15 @@ LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n"); return false; }; + auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) { + if (OffsetInfoMap.count(NewU)) + return OffsetInfoMap[NewU] == OffsetInfoMap[OldU]; + OffsetInfoMap[NewU] = OffsetInfoMap[OldU]; + return true; + }; if (!A.checkForAllUses(UsePred, *this, AssociatedValue, - /* CheckBBLivenessOnly */ true)) + /* CheckBBLivenessOnly */ true, DepClassTy::OPTIONAL, + EquivalentUseCB)) return indicatePessimisticFixpoint(); LLVM_DEBUG({ @@ -2325,6 +2338,8 @@ /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { AANoRecurseImpl::initialize(A); + // TODO: We should build a call graph ourselves to enable this in the module + // pass as well. if (const Function *F = getAnchorScope()) if (A.getInfoCache().getSccSize(*F) != 1) indicatePessimisticFixpoint(); @@ -5236,6 +5251,8 @@ if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, AA, &L)) return false; + const auto *TLI = + A.getInfoCache().getTargetLibraryInfoForFunction(*L.getFunction()); for (Value *Obj : Objects) { LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n"); if (isa(Obj)) @@ -5250,9 +5267,10 @@ continue; return false; } - if (!isa(Obj) && !isa(Obj)) + if (!isa(Obj) && !isa(Obj) && + !isNoAliasFn(Obj, TLI)) return false; - Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType()); + Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType(), TLI); if (!InitialVal || !Union(*InitialVal)) return false; @@ -5929,6 +5947,8 @@ } Align Alignment(1); + if (MaybeAlign RetAlign = AI.CB->getRetAlign()) + Alignment = max(Alignment, RetAlign); if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) { Optional AlignmentAPI = getAPInt(A, *this, *AI.CB->getArgOperand(0)); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" @@ -153,14 +154,6 @@ namespace { -enum class AddressSpace : unsigned { - Generic = 0, - Global = 1, - Shared = 3, - Constant = 4, - Local = 5, -}; - struct AAHeapToShared; struct AAICVTracker; @@ -1597,8 +1590,10 @@ &F.getEntryBlock(), F.getEntryBlock().begin())); // Create a fallback location if non was found. // TODO: Use the debug locations of the calls instead. - Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(); - Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc); + uint32_t SrcLocStrSize; + Constant *Loc = + OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); + Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize); } return Ident; } @@ -2548,7 +2543,7 @@ } /// Set of basic blocks that are executed by a single thread. 
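The TargetLibraryInfo plumbing threaded through Attributor.cpp and the load simplification above exists so that getInitialValueForObj can answer for heap objects, not just globals: memory from malloc-like or aligned_alloc-like functions starts out undef, memory from calloc-like functions starts out zero, and other noalias functions stay unknown. A self-contained restatement of that mapping (the AllocKind enum is an assumption of this sketch; the real code classifies the call via TLI queries such as isCallocLikeFn):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"

using namespace llvm;

enum class AllocKind { Malloc, AlignedAlloc, Calloc, OtherNoAlias };

// Initial value of freshly allocated memory, as assumed by the patch.
Constant *initialValueForAllocation(AllocKind K, Type &Ty) {
  switch (K) {
  case AllocKind::Malloc:
  case AllocKind::AlignedAlloc:
    return UndefValue::get(&Ty); // uninitialized: any load yields undef
  case AllocKind::Calloc:
    return Constant::getNullValue(&Ty); // zero-initialized by contract
  case AllocKind::OtherNoAlias:
    return nullptr; // e.g. strdup-like: contents unknown
  }
  return nullptr;
}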
- DenseSet SingleThreadedBBs; + SmallSetVector SingleThreadedBBs; /// Total number of basic blocks in this function. long unsigned NumBBs; @@ -2572,7 +2567,7 @@ if (!A.checkForAllCallSites(PredForCallSite, *this, /* RequiresAllCallSites */ true, AllCallSitesKnown)) - SingleThreadedBBs.erase(&F->getEntryBlock()); + SingleThreadedBBs.remove(&F->getEntryBlock()); auto &OMPInfoCache = static_cast(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; @@ -2637,7 +2632,7 @@ for (auto *BB : RPOT) { if (!MergePredecessorStates(BB)) - SingleThreadedBBs.erase(BB); + SingleThreadedBBs.remove(BB); } return (NumSingleThreadedBBs == SingleThreadedBBs.size()) @@ -2786,7 +2781,10 @@ }; A.emitRemark(CB, "OMP111", Remark); - SharedMem->setAlignment(MaybeAlign(32)); + MaybeAlign Alignment = CB->getRetAlign(); + assert(Alignment && + "HeapToShared on allocation without alignment attribute"); + SharedMem->setAlignment(MaybeAlign(Alignment)); A.changeValueAfterManifest(*CB, *NewBuffer); A.deleteAfterManifest(*CB); @@ -2813,7 +2811,7 @@ if (CallBase *CB = dyn_cast(U)) if (!isa(CB->getArgOperand(0)) || !ED.isExecutedByInitialThreadOnly(*CB)) - MallocCalls.erase(CB); + MallocCalls.remove(CB); } findPotentialRemovedFreeCalls(A); @@ -2825,7 +2823,7 @@ } /// Collection of all malloc calls in a function. - SmallPtrSet MallocCalls; + SmallSetVector MallocCalls; /// Collection of potentially removed free calls in a function. SmallPtrSet PotentialRemovedFreeCalls; }; @@ -3225,8 +3223,11 @@ OpenMPIRBuilder::LocationDescription Loc( InsertPointTy(ParentBB, ParentBB->end()), DL); OMPInfoCache.OMPBuilder.updateToLocation(Loc); - auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc); - Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + auto *SrcLocStr = + OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = + OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL); // Add check for Tid in RegionCheckTidBB @@ -4508,6 +4509,8 @@ bool UsedAssumedInformation = false; A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr, UsedAssumedInformation); + } else if (auto *SI = dyn_cast(&I)) { + A.getOrCreateAAFor(IRPosition::value(*SI)); } } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -503,7 +503,7 @@ /// Returns true if we can rewrite Start as a GEP with pointer Base /// and some integer offset. The nodes that need to be re-written /// for this transformation will be added to Explored. -static bool canRewriteGEPAsOffset(Value *Start, Value *Base, +static bool canRewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base, const DataLayout &DL, SetVector &Explored) { SmallVector WorkList(1, Start); @@ -551,7 +551,7 @@ // the original pointer type. We could handle more cases in the // future. if (GEP->getNumIndices() != 1 || !GEP->isInBounds() || - GEP->getType() != Start->getType()) + GEP->getSourceElementType() != ElemTy) return false; if (!Explored.contains(GEP->getOperand(0))) @@ -627,7 +627,7 @@ /// Returns a re-written value of Start as an indexed GEP using Base as a /// pointer. 
-static Value *rewriteGEPAsOffset(Value *Start, Value *Base, +static Value *rewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base, const DataLayout &DL, SetVector &Explored) { // Perform all the substitutions. This is a bit tricky because we can @@ -714,6 +714,8 @@ } } + PointerType *PtrTy = + ElemTy->getPointerTo(Start->getType()->getPointerAddressSpace()); for (Value *Val : Explored) { if (Val == Base) continue; @@ -722,22 +724,14 @@ // a GEP or a GEP + ptrtoint. setInsertionPoint(Builder, Val, false); - // If required, create an inttoptr instruction for Base. - Value *NewBase = Base; - if (!Base->getType()->isPointerTy()) - NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(), - Start->getName() + "to.ptr"); - - Value *GEP = Builder.CreateInBoundsGEP( - Start->getType()->getPointerElementType(), NewBase, - makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr"); - - if (!Val->getType()->isPointerTy()) { - Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(), - Val->getName() + ".conv"); - GEP = Cast; - } - Val->replaceAllUsesWith(GEP); + // Cast base to the expected type. + Value *NewVal = Builder.CreateBitOrPointerCast( + Base, PtrTy, Start->getName() + "to.ptr"); + NewVal = Builder.CreateInBoundsGEP( + ElemTy, NewVal, makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr"); + NewVal = Builder.CreateBitOrPointerCast( + NewVal, Val->getType(), Val->getName() + ".conv"); + Val->replaceAllUsesWith(NewVal); } return NewInsts[Start]; @@ -747,7 +741,7 @@ /// the input Value as a constant indexed GEP. Returns a pair containing /// the GEPs Pointer and Index. static std::pair -getAsConstantIndexedAddress(Value *V, const DataLayout &DL) { +getAsConstantIndexedAddress(Type *ElemTy, Value *V, const DataLayout &DL) { Type *IndexType = IntegerType::get(V->getContext(), DL.getIndexTypeSizeInBits(V->getType())); @@ -759,7 +753,7 @@ if (!GEP->isInBounds()) break; if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 && - GEP->getType() == V->getType()) { + GEP->getSourceElementType() == ElemTy) { V = GEP->getOperand(0); Constant *GEPIndex = static_cast(GEP->getOperand(1)); Index = ConstantExpr::getAdd( @@ -798,17 +792,14 @@ if (!GEPLHS->hasAllConstantIndices()) return nullptr; - // Make sure the pointers have the same type. - if (GEPLHS->getType() != RHS->getType()) - return nullptr; - + Type *ElemTy = GEPLHS->getSourceElementType(); Value *PtrBase, *Index; - std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL); + std::tie(PtrBase, Index) = getAsConstantIndexedAddress(ElemTy, GEPLHS, DL); // The set of nodes that will take part in this transformation. SetVector Nodes; - if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes)) + if (!canRewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes)) return nullptr; // We know we can re-write this as @@ -817,7 +808,7 @@ // can't have overflow on either side. We can therefore re-write // this as: // OFFSET1 cmp OFFSET2 - Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes); + Value *NewRHS = rewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes); // RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written // GEP having PtrBase as the pointer base, and has returned in NewRHS the @@ -894,9 +885,10 @@ // If the base pointers are different, but the indices are the same, just // compare the base pointer. 
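With the element type threaded through explicitly, rewriteGEPAsOffset no longer derives the GEP type from Start's pointee type; it emits a fixed three-step sequence for each rewritten value: cast the base to a pointer-to-ElemTy, apply the accumulated index as a single inbounds GEP, and cast the result back to the original value's type. CreateBitOrPointerCast is a no-op when the types already match, so the common case degenerates to just the GEP. A sketch of that emission order, simplified to a single index (not the actual function, which walks a whole SetVector of nodes):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

Value *emitRewrittenGEP(IRBuilder<> &B, Type *ElemTy, PointerType *PtrTy,
                        Value *Base, Value *Index, Value *Original) {
  // 1. Normalize the base: covers integer bases (inttoptr) and mismatched
  //    pointer types (bitcast) alike; folds away when no cast is needed.
  Value *V = B.CreateBitOrPointerCast(Base, PtrTy, "to.ptr");
  // 2. One inbounds GEP over the explicit element type.
  V = B.CreateInBoundsGEP(ElemTy, V, Index, "ptr");
  // 3. Cast back to whatever the rewritten value's users expect.
  return B.CreateBitOrPointerCast(V, Original->getType(), "conv");
}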
if (PtrBase != GEPRHS->getOperand(0)) { - bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands(); - IndicesTheSame &= GEPLHS->getOperand(0)->getType() == - GEPRHS->getOperand(0)->getType(); + bool IndicesTheSame = + GEPLHS->getNumOperands() == GEPRHS->getNumOperands() && + GEPLHS->getType() == GEPRHS->getType() && + GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType(); if (IndicesTheSame) for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i) if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) { @@ -3957,6 +3949,33 @@ (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) return new ICmpInst(Pred, X, Builder.CreateNot(Op0)); + { + // Similar to above: an unsigned overflow comparison may use offset + mask: + // ((Op1 + C) & C) u< Op1 --> Op1 != 0 + // ((Op1 + C) & C) u>= Op1 --> Op1 == 0 + // Op0 u> ((Op0 + C) & C) --> Op0 != 0 + // Op0 u<= ((Op0 + C) & C) --> Op0 == 0 + BinaryOperator *BO; + const APInt *C; + if ((Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE) && + match(Op0, m_And(m_BinOp(BO), m_LowBitMask(C))) && + match(BO, m_Add(m_Specific(Op1), m_SpecificIntAllowUndef(*C)))) { + CmpInst::Predicate NewPred = + Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + Constant *Zero = ConstantInt::getNullValue(Op1->getType()); + return new ICmpInst(NewPred, Op1, Zero); + } + + if ((Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE) && + match(Op1, m_And(m_BinOp(BO), m_LowBitMask(C))) && + match(BO, m_Add(m_Specific(Op0), m_SpecificIntAllowUndef(*C)))) { + CmpInst::Predicate NewPred = + Pred == ICmpInst::ICMP_UGT ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + Constant *Zero = ConstantInt::getNullValue(Op1->getType()); + return new ICmpInst(NewPred, Op0, Zero); + } + } + bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; if (BO0 && isa(BO0)) NoOp0WrapProblem = diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -148,6 +148,7 @@ Instruction *SliceUpIllegalIntegerPHI(PHINode &PN); Instruction *visitPHINode(PHINode &PN); Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP); + Instruction *visitGEPOfGEP(GetElementPtrInst &GEP, GEPOperator *Src); Instruction *visitAllocaInst(AllocaInst &AI); Instruction *visitAllocSite(Instruction &FI); Instruction *visitFree(CallInst &FI); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -495,8 +495,7 @@ } GetElementPtrInst *NewGEP = GetElementPtrInst::Create( - cast(NewPtr->getType())->getElementType(), NewPtr, - NewOps); + GEP->getSourceElementType(), NewPtr, NewOps); NewGEP->setIsInBounds(GEP->isInBounds()); return NewGEP; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1884,6 +1884,139 @@ return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel); } +Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, + GEPOperator *Src) { + // Combine Indices - If the source pointer to this getelementptr instruction + // is a getelementptr instruction with 
matching element type, combine the + // indices of the two getelementptr instructions into a single instruction. + if (Src->getResultElementType() != GEP.getSourceElementType()) + return nullptr; + + if (!shouldMergeGEPs(*cast(&GEP), *Src)) + return nullptr; + + if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 && + Src->hasOneUse()) { + Value *GO1 = GEP.getOperand(1); + Value *SO1 = Src->getOperand(1); + + if (LI) { + // Try to reassociate loop invariant GEP chains to enable LICM. + if (Loop *L = LI->getLoopFor(GEP.getParent())) { + // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is + // invariant: this breaks the dependence between GEPs and allows LICM + // to hoist the invariant part out of the loop. + if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) { + // We have to be careful here. + // We have something like: + // %src = getelementptr , * %base, %idx + // %gep = getelementptr , * %src, %idx2 + // If we just swap idx & idx2 then we could inadvertantly + // change %src from a vector to a scalar, or vice versa. + // Cases: + // 1) %base a scalar & idx a scalar & idx2 a vector + // => Swapping idx & idx2 turns %src into a vector type. + // 2) %base a scalar & idx a vector & idx2 a scalar + // => Swapping idx & idx2 turns %src in a scalar type + // 3) %base, %idx, and %idx2 are scalars + // => %src & %gep are scalars + // => swapping idx & idx2 is safe + // 4) %base a vector + // => %src is a vector + // => swapping idx & idx2 is safe. + auto *SO0 = Src->getOperand(0); + auto *SO0Ty = SO0->getType(); + if (!isa(GEP.getType()) || // case 3 + isa(SO0Ty)) { // case 4 + Src->setOperand(1, GO1); + GEP.setOperand(1, SO1); + return &GEP; + } else { + // Case 1 or 2 + // -- have to recreate %src & %gep + // put NewSrc at same location as %src + Builder.SetInsertPoint(cast(Src)); + Value *NewSrc = Builder.CreateGEP( + GEP.getSourceElementType(), SO0, GO1, Src->getName()); + // Propagate 'inbounds' if the new source was not constant-folded. + if (auto *NewSrcGEPI = dyn_cast(NewSrc)) + NewSrcGEPI->setIsInBounds(Src->isInBounds()); + GetElementPtrInst *NewGEP = GetElementPtrInst::Create( + GEP.getSourceElementType(), NewSrc, {SO1}); + NewGEP->setIsInBounds(GEP.isInBounds()); + return NewGEP; + } + } + } + } + } + + // Note that if our source is a gep chain itself then we wait for that + // chain to be resolved before we perform this transformation. This + // avoids us creating a TON of code in some cases. + if (auto *SrcGEP = dyn_cast(Src->getOperand(0))) + if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) + return nullptr; // Wait until our source is folded to completion. + + SmallVector Indices; + + // Find out whether the last index in the source GEP is a sequential idx. + bool EndsWithSequential = false; + for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); + I != E; ++I) + EndsWithSequential = I.isSequential(); + + // Can we combine the two pointer arithmetics offsets? + if (EndsWithSequential) { + // Replace: gep (gep %P, long B), long A, ... + // With: T = long A+B; gep %P, T, ... + Value *SO1 = Src->getOperand(Src->getNumOperands()-1); + Value *GO1 = GEP.getOperand(1); + + // If they aren't the same type, then the input hasn't been processed + // by the loop above yet (which canonicalizes sequential index types to + // intptr_t). Just avoid transforming this until the input has been + // normalized. 
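For the trailing-sequential-index case handled above, the correctness of the fold is ordinary pointer arithmetic: gep (gep %P, B), A addresses the same element as gep %P, A+B whenever both GEPs stride over the same element type, which is exactly what the Src->getResultElementType() == GEP.getSourceElementType() guard establishes. The identity, demonstrated on a plain C++ array (an illustration of the address math only, not of the IR rewrite):

#include <cstdio>

int main() {
  long Data[64] = {};
  long *P = Data;
  long B = 5, A = 7;
  long *Chained = (P + B) + A; // gep (gep %P, B), A
  long *Merged = P + (A + B);  // gep %P, A+B -- the merged form
  printf("%d\n", Chained == Merged); // 1
}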
+ if (SO1->getType() != GO1->getType()) + return nullptr; + + Value *Sum = + SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); + // Only do the combine when we are sure the cost after the + // merge is never more than that before the merge. + if (Sum == nullptr) + return nullptr; + + // Update the GEP in place if possible. + if (Src->getNumOperands() == 2) { + GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast(&GEP))); + replaceOperand(GEP, 0, Src->getOperand(0)); + replaceOperand(GEP, 1, Sum); + return &GEP; + } + Indices.append(Src->op_begin()+1, Src->op_end()-1); + Indices.push_back(Sum); + Indices.append(GEP.op_begin()+2, GEP.op_end()); + } else if (isa(*GEP.idx_begin()) && + cast(*GEP.idx_begin())->isNullValue() && + Src->getNumOperands() != 1) { + // Otherwise we can do the fold if the first index of the GEP is a zero + Indices.append(Src->op_begin()+1, Src->op_end()); + Indices.append(GEP.idx_begin()+1, GEP.idx_end()); + } + + if (!Indices.empty()) + return isMergedGEPInBounds(*Src, *cast(&GEP)) + ? GetElementPtrInst::CreateInBounds( + Src->getSourceElementType(), Src->getOperand(0), Indices, + GEP.getName()) + : GetElementPtrInst::Create(Src->getSourceElementType(), + Src->getOperand(0), Indices, + GEP.getName()); + + return nullptr; +} + Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector Ops(GEP.operands()); Type *GEPType = GEP.getType(); @@ -2063,132 +2196,9 @@ PtrOp = NewGEP; } - // Combine Indices - If the source pointer to this getelementptr instruction - // is a getelementptr instruction, combine the indices of the two - // getelementptr instructions into a single instruction. - if (auto *Src = dyn_cast(PtrOp)) { - if (!shouldMergeGEPs(*cast(&GEP), *Src)) - return nullptr; - - if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 && - Src->hasOneUse()) { - Value *GO1 = GEP.getOperand(1); - Value *SO1 = Src->getOperand(1); - - if (LI) { - // Try to reassociate loop invariant GEP chains to enable LICM. - if (Loop *L = LI->getLoopFor(GEP.getParent())) { - // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is - // invariant: this breaks the dependence between GEPs and allows LICM - // to hoist the invariant part out of the loop. - if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) { - // We have to be careful here. - // We have something like: - // %src = getelementptr , * %base, %idx - // %gep = getelementptr , * %src, %idx2 - // If we just swap idx & idx2 then we could inadvertantly - // change %src from a vector to a scalar, or vice versa. - // Cases: - // 1) %base a scalar & idx a scalar & idx2 a vector - // => Swapping idx & idx2 turns %src into a vector type. - // 2) %base a scalar & idx a vector & idx2 a scalar - // => Swapping idx & idx2 turns %src in a scalar type - // 3) %base, %idx, and %idx2 are scalars - // => %src & %gep are scalars - // => swapping idx & idx2 is safe - // 4) %base a vector - // => %src is a vector - // => swapping idx & idx2 is safe. - auto *SO0 = Src->getOperand(0); - auto *SO0Ty = SO0->getType(); - if (!isa(GEPType) || // case 3 - isa(SO0Ty)) { // case 4 - Src->setOperand(1, GO1); - GEP.setOperand(1, SO1); - return &GEP; - } else { - // Case 1 or 2 - // -- have to recreate %src & %gep - // put NewSrc at same location as %src - Builder.SetInsertPoint(cast(PtrOp)); - Value *NewSrc = - Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()); - // Propagate 'inbounds' if the new source was not constant-folded. 
- if (auto *NewSrcGEPI = dyn_cast(NewSrc)) - NewSrcGEPI->setIsInBounds(Src->isInBounds()); - GetElementPtrInst *NewGEP = - GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1}); - NewGEP->setIsInBounds(GEP.isInBounds()); - return NewGEP; - } - } - } - } - } - - // Note that if our source is a gep chain itself then we wait for that - // chain to be resolved before we perform this transformation. This - // avoids us creating a TON of code in some cases. - if (auto *SrcGEP = dyn_cast(Src->getOperand(0))) - if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) - return nullptr; // Wait until our source is folded to completion. - - SmallVector Indices; - - // Find out whether the last index in the source GEP is a sequential idx. - bool EndsWithSequential = false; - for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); - I != E; ++I) - EndsWithSequential = I.isSequential(); - - // Can we combine the two pointer arithmetics offsets? - if (EndsWithSequential) { - // Replace: gep (gep %P, long B), long A, ... - // With: T = long A+B; gep %P, T, ... - Value *SO1 = Src->getOperand(Src->getNumOperands()-1); - Value *GO1 = GEP.getOperand(1); - - // If they aren't the same type, then the input hasn't been processed - // by the loop above yet (which canonicalizes sequential index types to - // intptr_t). Just avoid transforming this until the input has been - // normalized. - if (SO1->getType() != GO1->getType()) - return nullptr; - - Value *Sum = - SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); - // Only do the combine when we are sure the cost after the - // merge is never more than that before the merge. - if (Sum == nullptr) - return nullptr; - - // Update the GEP in place if possible. - if (Src->getNumOperands() == 2) { - GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast(&GEP))); - replaceOperand(GEP, 0, Src->getOperand(0)); - replaceOperand(GEP, 1, Sum); - return &GEP; - } - Indices.append(Src->op_begin()+1, Src->op_end()-1); - Indices.push_back(Sum); - Indices.append(GEP.op_begin()+2, GEP.op_end()); - } else if (isa(*GEP.idx_begin()) && - cast(*GEP.idx_begin())->isNullValue() && - Src->getNumOperands() != 1) { - // Otherwise we can do the fold if the first index of the GEP is a zero - Indices.append(Src->op_begin()+1, Src->op_end()); - Indices.append(GEP.idx_begin()+1, GEP.idx_end()); - } - - if (!Indices.empty()) - return isMergedGEPInBounds(*Src, *cast(&GEP)) - ? GetElementPtrInst::CreateInBounds( - Src->getSourceElementType(), Src->getOperand(0), Indices, - GEP.getName()) - : GetElementPtrInst::Create(Src->getSourceElementType(), - Src->getOperand(0), Indices, - GEP.getName()); - } + if (auto *Src = dyn_cast(PtrOp)) + if (Instruction *I = visitGEPOfGEP(GEP, Src)) + return I; // Skip if GEP source element type is scalable. The type alloc size is unknown // at compile-time. @@ -2234,7 +2244,11 @@ Value *StrippedPtr = PtrOp->stripPointerCasts(); PointerType *StrippedPtrTy = cast(StrippedPtr->getType()); - if (StrippedPtr != PtrOp) { + // TODO: The basic approach of these folds is not compatible with opaque + // pointers, because we can't use bitcasts as a hint for a desirable GEP + // type. Instead, we should perform canonicalization directly on the GEP + // type. For now, skip these. 
+ if (StrippedPtr != PtrOp && !StrippedPtrTy->isOpaque()) { bool HasZeroPointerIndex = false; Type *StrippedPtrEltTy = StrippedPtrTy->getElementType(); diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -982,6 +982,7 @@ assert(isLoopCounter(IndVar, L, SE)); const SCEVAddRecExpr *AR = cast(SE->getSCEV(IndVar)); const SCEV *IVInit = AR->getStart(); + assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); // IVInit may be a pointer while ExitCount is an integer when FindLoopCounter // finds a valid pointer IV. Sign extend ExitCount in order to materialize a @@ -1004,13 +1005,6 @@ assert(SE->isLoopInvariant(IVOffset, L) && "Computed iteration count is not loop invariant!"); - // We could handle pointer IVs other than i8*, but we need to compensate for - // gep index scaling. - assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()), - cast(IndVar->getType()) - ->getElementType())->isOne() && - "unit stride pointer IV must be i8*"); - const SCEV *IVLimit = SE->getAddExpr(IVInit, IVOffset); BranchInst *BI = cast(ExitingBB->getTerminator()); return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), BI); @@ -1026,7 +1020,6 @@ // IVInit integer and ExitCount pointer would only occur if a canonical IV // were generated on top of case #2, which is not expected. - assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); // For unit stride, IVCount = Start + ExitCount with 2's complement // overflow. diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1142,7 +1142,7 @@ // automatic unrolling from interfering with the user requested // transformation. Loop *ParentL = L->getParentLoop(); - if (ParentL != NULL && + if (ParentL != nullptr && hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser && hasUnrollTransformation(L) != TM_ForcedByUser) { LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has" diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1322,11 +1322,11 @@ Value *NewGVN::lookupOperandLeader(Value *V) const { CongruenceClass *CC = ValueToClass.lookup(V); if (CC) { - // Everything in TOP is represented by undef, as it can be any value. + // Everything in TOP is represented by poison, as it can be any value. // We do have to make sure we get the type right though, so we can't set the - // RepLeader to undef. + // RepLeader to poison. if (CC == TOPClass) - return UndefValue::get(V->getType()); + return PoisonValue::get(V->getType()); return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader(); } @@ -1521,9 +1521,9 @@ return nullptr; Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand()); - // Load of undef is undef. + // Load of undef is UB. 
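The undef-to-poison changes running through NewGVN all rest on the same refinement argument: poison is "more undefined" than undef, so it is always legal where undef served as a placeholder (a TOP-class leader, an unreachable definition, a dead instruction's replacement), and it propagates through later folds more aggressively. In the load case right above the justification is stronger still: loading from an undef address is immediate UB, so the result may as well be poison. The substitution itself is mechanical:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Placeholder for a value about which nothing is known. Everything in the
// TOP congruence class can still become any value, and poison refines to
// anything, so it is the weakest -- and therefore safest -- leader.
Constant *leaderForTopClass(Type *Ty) {
  return PoisonValue::get(Ty); // previously UndefValue::get(Ty)
}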
if (isa(LoadAddressLeader)) - return createConstantExpression(UndefValue::get(LI->getType())); + return createConstantExpression(PoisonValue::get(LI->getType())); MemoryAccess *OriginalAccess = getMemoryAccess(I); MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(OriginalAccess); @@ -1531,9 +1531,9 @@ if (!MSSA->isLiveOnEntryDef(DefiningAccess)) { if (auto *MD = dyn_cast(DefiningAccess)) { Instruction *DefiningInst = MD->getMemoryInst(); - // If the defining instruction is not reachable, replace with undef. + // If the defining instruction is not reachable, replace with poison. if (!ReachableBlocks.count(DefiningInst->getParent())) - return createConstantExpression(UndefValue::get(LI->getType())); + return createConstantExpression(PoisonValue::get(LI->getType())); // This will handle stores and memory insts. We only do if it the // defining access has a different type, or it is a pointer produced by // certain memory operations that cause the memory to have a fixed value @@ -1722,8 +1722,12 @@ // We match the semantics of SimplifyPhiNode from InstructionSimplify here. // See if all arguments are the same. // We track if any were undef because they need special handling. - bool HasUndef = false; + bool HasUndef = false, HasPoison = false; auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) { + if (isa(Arg)) { + HasPoison = true; + return false; + } if (isa(Arg)) { HasUndef = true; return false; @@ -1732,8 +1736,14 @@ }); // If we are left with no operands, it's dead. if (Filtered.empty()) { - // If it has undef at this point, it means there are no-non-undef arguments, - // and thus, the value of the phi node must be undef. + // If it has undef or poison at this point, it means there are no-non-undef + // arguments, and thus, the value of the phi node must be undef. + if (HasPoison && !HasUndef) { + LLVM_DEBUG( + dbgs() << "PHI Node " << *I + << " has no non-poison arguments, valuing it as poison\n"); + return createConstantExpression(PoisonValue::get(I->getType())); + } if (HasUndef) { LLVM_DEBUG( dbgs() << "PHI Node " << *I @@ -1758,7 +1768,7 @@ // expression to say if one is equivalent to the other. // We also special case undef, so that if we have an undef, we can't use the // common value unless it dominates the phi block. - if (HasUndef) { + if (HasPoison || HasUndef) { // If we have undef and at least one other value, this is really a // multivalued phi, and we need to know if it's cycle free in order to // evaluate whether we can ignore the undef. The other parts of this are @@ -2780,7 +2790,7 @@ LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block " << getBlockName(PredBB) << " because the block is unreachable\n"); - FoundVal = UndefValue::get(I->getType()); + FoundVal = PoisonValue::get(I->getType()); RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I)); } @@ -3459,7 +3469,7 @@ // Delete all instructions marked for deletion. 
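The phi-simplification change above filters operands in two tiers: poison operands are dropped first (recorded in HasPoison), then undef operands (recorded in HasUndef), and only what remains feeds the leader computation. When nothing remains, the phi folds to poison only if poison was the sole kind seen; a single undef operand forces the weaker undef result, because the phi could still take that undef's less-poisonous value. The empty-after-filtering decision, restated without the LLVM iterator machinery (the Kind enum is an assumption of this sketch):

#include <vector>

enum class Kind { Poison, Undef, Real };

// Value of a phi whose operands all filtered out as poison/undef.
// Precondition (as in NewGVN): at least one operand, none of them Real.
Kind valueOfFilteredOutPhi(const std::vector<Kind> &Ops) {
  bool HasPoison = false, HasUndef = false;
  for (Kind K : Ops) {
    HasPoison |= (K == Kind::Poison);
    HasUndef |= (K == Kind::Undef);
  }
  return (HasPoison && !HasUndef) ? Kind::Poison : Kind::Undef;
}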
for (Instruction *ToErase : InstructionsToErase) { if (!ToErase->use_empty()) - ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType())); + ToErase->replaceAllUsesWith(PoisonValue::get(ToErase->getType())); assert(ToErase->getParent() && "BB containing ToErase deleted unexpectedly!"); @@ -3677,7 +3687,7 @@ for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) { Instruction &Inst = *I++; if (!Inst.use_empty()) - Inst.replaceAllUsesWith(UndefValue::get(Inst.getType())); + Inst.replaceAllUsesWith(PoisonValue::get(Inst.getType())); if (isa(Inst)) continue; salvageKnowledge(&Inst, AC); @@ -3687,7 +3697,7 @@ } // Now insert something that simplifycfg will turn into an unreachable. Type *Int8Ty = Type::getInt8Ty(BB->getContext()); - new StoreInst(UndefValue::get(Int8Ty), + new StoreInst(PoisonValue::get(Int8Ty), Constant::getNullValue(Int8Ty->getPointerTo()), BB->getTerminator()); } @@ -3827,8 +3837,8 @@ LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block " << getBlockName(PHI->getIncomingBlock(Operand)) - << " with undef due to it being unreachable\n"); - Operand.set(UndefValue::get(PHI->getType())); + << " with poison due to it being unreachable\n"); + Operand.set(PoisonValue::get(PHI->getType())); } }; // Replace unreachable phi arguments. @@ -4128,21 +4138,25 @@ unsigned int NewGVN::getRank(const Value *V) const { // Prefer constants to undef to anything else // Undef is a constant, have to check it first. + // Prefer poison to undef as it's less defined. // Prefer smaller constants to constantexprs + // Note that the order here matters because of class inheritance if (isa(V)) - return 2; - if (isa(V)) + return 3; + if (isa(V)) return 1; + if (isa(V)) + return 2; if (isa(V)) return 0; - else if (auto *A = dyn_cast(V)) - return 3 + A->getArgNo(); + if (auto *A = dyn_cast(V)) + return 4 + A->getArgNo(); - // Need to shift the instruction DFS by number of arguments + 3 to account for + // Need to shift the instruction DFS by number of arguments + 5 to account for // the constant and argument ranking above. unsigned Result = InstrToDFSNum(V); if (Result > 0) - return 4 + NumFuncArgs + Result; + return 5 + NumFuncArgs + Result; // Unreachable or something else, just return a really large number. return ~0; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -582,7 +582,8 @@ /// Create the exit value of first order recurrences in the middle block and /// update their users. - void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State); + void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, + VPTransformState &State); /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); @@ -4096,8 +4097,8 @@ } } -void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, - VPTransformState &State) { +void InnerLoopVectorizer::fixFirstOrderRecurrence( + VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the // following loop. @@ -7961,7 +7962,6 @@ // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
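The getRank hunk above lost its template arguments to formatting, but they can be reconstructed from its own comments: plain constants rank lowest (most preferred), then poison, then undef, then constant expressions, then arguments, then instructions by DFS number; and because PoisonValue inherits from UndefValue, which inherits from Constant, the isa<> tests must run in most-derived-first order. A restatement with the inferred arguments spelled out:

#include "llvm/IR/Argument.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

unsigned rankOf(const Value *V, unsigned NumFuncArgs, unsigned DFSNum) {
  if (isa<ConstantExpr>(V))
    return 3;
  if (isa<PoisonValue>(V)) // must precede UndefValue: poison is-an undef
    return 1;
  if (isa<UndefValue>(V))  // must precede Constant: undef is-a constant
    return 2;
  if (isa<Constant>(V))
    return 0;
  if (auto *A = dyn_cast<Argument>(V))
    return 4 + A->getArgNo();
  if (DFSNum > 0)
    return 5 + NumFuncArgs + DFSNum;
  return ~0u; // unreachable or otherwise unranked
}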
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -582,7 +582,8 @@
   /// Create the exit value of first order recurrences in the middle block and
   /// update their users.
-  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
+  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
+                               VPTransformState &State);

   /// Create code for the loop exit value of the reduction.
   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
@@ -4096,8 +4097,8 @@
   }
 }

-void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
-                                                  VPTransformState &State) {
+void InnerLoopVectorizer::fixFirstOrderRecurrence(
+    VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
   // This is the second phase of vectorizing first-order recurrences. An
   // overview of the transformation is described below. Suppose we have the
   // following loop.
@@ -7961,7 +7962,6 @@
   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
-  State.TripCount = ILV.getOrCreateTripCount(nullptr);
   State.CanonicalIV = ILV.Induction;

   ILV.collectPoisonGeneratingRecipes(State);
@@ -7976,6 +7976,7 @@
   //===------------------------------------------------===//

   // 2. Copy and widen instructions from the old loop into the new loop.
+  BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), State);
   BestVPlan.execute(&State);

   // 3. Fix the vectorized code: take care of header phi's, live-outs,
@@ -8457,16 +8458,13 @@
   auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
   Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);

-  VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
-  bool TailFolded = !CM.isScalarEpilogueAllowed();
+  assert(CM.foldTailByMasking() && "must fold the tail");

-  if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
-    // While ActiveLaneMask is a binary op that consumes the loop tripcount
-    // as a second argument, we only pass the IV here and extract the
-    // tripcount from the transform state where codegen of the VP instructions
-    // happen.
-    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+  if (CM.TTI.emitGetActiveLaneMask()) {
+    VPValue *TC = Plan->getOrCreateTripCount();
+    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
   } else {
+    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
   }
   return BlockMaskCache[BB] = BlockMask;
@@ -8712,7 +8710,7 @@
 void VPRecipeBuilder::fixHeaderPhis() {
   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
-  for (VPWidenPHIRecipe *R : PhisToFix) {
+  for (VPHeaderPHIRecipe *R : PhisToFix) {
     auto *PN = cast<PHINode>(R->getUnderlyingValue());
     VPRecipeBase *IncR = getRecipe(
         cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
@@ -8854,7 +8852,7 @@
   if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
     return toVPRecipeResult(Recipe);

-  VPWidenPHIRecipe *PhiRecipe = nullptr;
+  VPHeaderPHIRecipe *PhiRecipe = nullptr;
   if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
     VPValue *StartV = Operands[0];
     if (Legal->isReductionVariable(Phi)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -59,7 +59,7 @@
   /// Cross-iteration reduction & first-order recurrence phis for which we need
   /// to add the incoming value from the backedge after all recipes have been
   /// created.
-  SmallVector<VPWidenPHIRecipe *, 4> PhisToFix;
+  SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix;

   /// Check if \p I can be widened at the start of \p Range and possibly
   /// decrease the range such that the returned value holds for the entire \p
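Passing the trip count as a second ActiveLaneMask operand matches the semantics of llvm.get.active.lane.mask(%base, %n), whose lane i is true iff %base + i < %n (unsigned). The old path instead compared against the backedge-taken count, i.e. n - 1, with ICmpULE. A scalar model of the equivalence, assuming a positive trip count (all names invented):

    #include <cstdint>
    #include <vector>

    // Lane i of active.lane.mask(IV, TC) is (IV + i < TC). The previous form
    // computed (IV + i <= BTC) with BTC == TC - 1; for TC > 0 the two agree,
    // but using TC directly avoids materializing the TC - 1 broadcast.
    std::vector<bool> activeLaneMask(uint64_t IV, uint64_t TC, unsigned VF) {
      std::vector<bool> Mask(VF);
      for (unsigned I = 0; I != VF; ++I)
        Mask[I] = IV + I < TC;
      return Mask;
    }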
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -341,9 +341,6 @@
   /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
   Value *CanonicalIV = nullptr;

-  /// Hold the trip count of the scalar loop.
-  Value *TripCount = nullptr;
-
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;

@@ -1059,34 +1056,21 @@
   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
 };

-/// A recipe for handling first order recurrences and pointer inductions. For
-/// first-order recurrences, the start value is the first operand of the recipe
-/// and the incoming value from the backedge is the second operand. It also
-/// serves as base class for VPReductionPHIRecipe. In the VPlan native path, all
-/// incoming VPValues & VPBasicBlock pairs are managed in the recipe directly.
-class VPWidenPHIRecipe : public VPRecipeBase, public VPValue {
-  /// List of incoming blocks. Only used in the VPlan native path.
-  SmallVector<VPBasicBlock *, 2> IncomingBlocks;
-
+/// A pure virtual base class for all recipes modeling header phis, including
+/// phis for first-order recurrences, pointer inductions and reductions. The
+/// start value is the first operand of the recipe and the incoming value from
+/// the backedge is the second operand.
+class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue {
 protected:
-  VPWidenPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi,
-                   VPValue *Start = nullptr)
+  VPHeaderPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi,
+                    VPValue *Start = nullptr)
       : VPRecipeBase(VPDefID, {}), VPValue(VPVID, Phi, this) {
     if (Start)
       addOperand(Start);
   }

 public:
-  /// Create a VPWidenPHIRecipe for \p Phi
-  VPWidenPHIRecipe(PHINode *Phi)
-      : VPWidenPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {}
-
-  /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
-  VPWidenPHIRecipe(PHINode *Phi, VPValue &Start) : VPWidenPHIRecipe(Phi) {
-    addOperand(&Start);
-  }
-
-  ~VPWidenPHIRecipe() override = default;
+  ~VPHeaderPHIRecipe() override = default;

   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPRecipeBase *B) {
@@ -1100,23 +1084,21 @@
            V->getVPValueID() == VPValue::VPVReductionPHISC;
   }

-  /// Generate the phi/select nodes.
-  void execute(VPTransformState &State) override;
+  /// Generate the phi nodes.
+  void execute(VPTransformState &State) override = 0;

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
-             VPSlotTracker &SlotTracker) const override;
+             VPSlotTracker &SlotTracker) const override = 0;
 #endif

-  /// Returns the start value of the phi, if it is a reduction or first-order
-  /// recurrence.
+  /// Returns the start value of the phi, if one is set.
   VPValue *getStartValue() {
     return getNumOperands() == 0 ? nullptr : getOperand(0);
   }

-  /// Returns the incoming value from the loop backedge, if it is a reduction or
-  /// first-order recurrence.
+  /// Returns the incoming value from the loop backedge.
   VPValue *getBackedgeValue() {
     return getOperand(1);
   }
@@ -1126,6 +1108,43 @@
   VPRecipeBase *getBackedgeRecipe() {
     return cast<VPRecipeBase>(getBackedgeValue()->getDef());
   }
+};
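The classof definition above is what lets clients write isa<VPHeaderPHIRecipe>(&R) and match any of the concrete phi recipes at once. A self-contained toy model of that LLVM-RTTI scheme (all class and enum names invented for illustration):

    #include "llvm/Support/Casting.h"

    namespace {
    // Shared discriminator plus per-class classof: the same pattern the
    // recipe hierarchy uses, where an abstract base accepts several leaf IDs.
    struct RecipeBase {
      enum IDTy { WidenPHISC, FirstOrderRecPHISC, ReductionPHISC, OtherSC };
      IDTy ID;
      explicit RecipeBase(IDTy ID) : ID(ID) {}
    };
    struct HeaderPHI : RecipeBase {
      using RecipeBase::RecipeBase;
      static bool classof(const RecipeBase *R) {
        return R->ID == WidenPHISC || R->ID == FirstOrderRecPHISC ||
               R->ID == ReductionPHISC;
      }
    };
    struct ReductionPHI : HeaderPHI {
      ReductionPHI() : HeaderPHI(ReductionPHISC) {}
      static bool classof(const RecipeBase *R) { return R->ID == ReductionPHISC; }
    };
    } // namespace

    // isa<HeaderPHI> is true for a ReductionPHI even though classof is not
    // virtual; dispatch happens through the stored ID.
    bool isHeaderPhi(const RecipeBase *R) { return llvm::isa<HeaderPHI>(R); }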
+
+/// A recipe for handling header phis that are widened in the vector loop. In
+/// the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
+/// managed in the recipe directly.
+class VPWidenPHIRecipe : public VPHeaderPHIRecipe {
+  /// List of incoming blocks. Only used in the VPlan native path.
+  SmallVector<VPBasicBlock *, 2> IncomingBlocks;
+
+public:
+  /// Create a VPWidenPHIRecipe for \p Phi.
+  VPWidenPHIRecipe(PHINode *Phi)
+      : VPHeaderPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {}
+
+  /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
+  VPWidenPHIRecipe(PHINode *Phi, VPValue &Start) : VPWidenPHIRecipe(Phi) {
+    addOperand(&Start);
+  }
+
+  ~VPWidenPHIRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *B) {
+    return B->getVPDefID() == VPRecipeBase::VPWidenPHISC;
+  }
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() == VPValue::VPVWidenPHISC;
+  }
+
+  /// Generate the phi/select nodes.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif

   /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
   void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
@@ -1133,20 +1152,20 @@
     IncomingBlocks.push_back(IncomingBlock);
   }

-  /// Returns the \p I th incoming VPValue.
-  VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
-
   /// Returns the \p I th incoming VPBasicBlock.
   VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
+
+  /// Returns the \p I th incoming VPValue.
+  VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
 };

 /// A recipe for handling first-order recurrence phis. The start value is the
 /// first operand of the recipe and the incoming value from the backedge is the
 /// second operand.
-struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe {
+struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
   VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start)
-      : VPWidenPHIRecipe(VPVFirstOrderRecurrencePHISC,
-                         VPFirstOrderRecurrencePHISC, Phi, &Start) {}
+      : VPHeaderPHIRecipe(VPVFirstOrderRecurrencePHISC,
+                          VPFirstOrderRecurrencePHISC, Phi, &Start) {}

   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPRecipeBase *R) {
@@ -1171,7 +1190,7 @@
 /// A recipe for handling reduction phis. The start value is the first operand
 /// of the recipe and the incoming value from the backedge is the second
 /// operand.
-class VPReductionPHIRecipe : public VPWidenPHIRecipe {
+class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   /// Descriptor for the reduction.
   const RecurrenceDescriptor &RdxDesc;

@@ -1187,7 +1206,7 @@
   VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
                        VPValue &Start, bool IsInLoop = false,
                        bool IsOrdered = false)
-      : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
+      : VPHeaderPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
         RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
     assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
   }
@@ -2112,8 +2131,12 @@
   // (operators '==' and '<').
   SetVector<VPValue *> VPExternalDefs;

-  /// Represents the backedge taken count of the original loop, for folding
+  /// Represents the trip count of the original loop, for folding
   /// the tail.
+  VPValue *TripCount = nullptr;
+
+  /// Represents the backedge taken count of the original loop, for folding
+  /// the tail. It equals TripCount - 1.
   VPValue *BackedgeTakenCount = nullptr;

   /// Holds a mapping between Values and their corresponding VPValue inside
@@ -2147,12 +2170,17 @@
     }
     for (VPValue *VPV : VPValuesToFree)
       delete VPV;
+    if (TripCount)
+      delete TripCount;
     if (BackedgeTakenCount)
       delete BackedgeTakenCount;
     for (VPValue *Def : VPExternalDefs)
       delete Def;
   }

+  /// Prepare the plan for execution, setting up the required live-in values.
+  void prepareToExecute(Value *TripCount, VPTransformState &State);
+
   /// Generate the IR code for this VPlan.
   void execute(struct VPTransformState *State);
@@ -2165,6 +2193,13 @@
     return Entry;
   }

+  /// The trip count of the original loop.
+  VPValue *getOrCreateTripCount() {
+    if (!TripCount)
+      TripCount = new VPValue();
+    return TripCount;
+  }
+
   /// The backedge taken count of the original loop.
   VPValue *getOrCreateBackedgeTakenCount() {
     if (!BackedgeTakenCount)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -677,7 +677,7 @@
     // Get first lane of vector induction variable.
     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
     // Get the original loop tripcount.
-    Value *ScalarTC = State.TripCount;
+    Value *ScalarTC = State.get(getOperand(1), Part);

     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
     auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue());
@@ -786,23 +786,31 @@
   FMF = FMFNew;
 }

-/// Generate the code inside the body of the vectorized loop. Assumes a single
-/// LoopVectorBody basic-block was created for this. Introduce additional
-/// basic-blocks as needed, and fill them all.
-void VPlan::execute(VPTransformState *State) {
-  // -1. Check if the backedge taken count is needed, and if so build it.
+void VPlan::prepareToExecute(Value *TripCountV, VPTransformState &State) {
+  // Check if the trip count is needed, and if so build it.
+  if (TripCount && TripCount->getNumUsers()) {
+    for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+      State.set(TripCount, TripCountV, Part);
+  }
+
+  // Check if the backedge taken count is needed, and if so build it.
   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
-    Value *TC = State->TripCount;
-    IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
-    auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+    IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+    auto *TCMO = Builder.CreateSub(TripCountV,
+                                   ConstantInt::get(TripCountV->getType(), 1),
                                    "trip.count.minus.1");
-    auto VF = State->VF;
+    auto VF = State.VF;
     Value *VTCMO =
         VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
-    for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
-      State->set(BackedgeTakenCount, VTCMO, Part);
+    for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+      State.set(BackedgeTakenCount, VTCMO, Part);
   }
+}

+/// Generate the code inside the body of the vectorized loop. Assumes a single
+/// LoopVectorBody basic-block was created for this. Introduce additional
+/// basic-blocks as needed, and fill them all.
+void VPlan::execute(VPTransformState *State) {
   // 0. Set the reverse mapping from VPValues to Values for code generation.
   for (auto &Entry : Value2VPValue)
     State->VPValue2Value[Entry.second] = Entry.first;
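prepareToExecute materializes the live-in IR values once per unroll part before codegen starts, and recipes later pull them back out; the ActiveLaneMask codegen above does exactly that with State.get(getOperand(1), Part). A toy model of this set-once/get-per-part contract, with invented names and int standing in for llvm::Value *:

    #include <cassert>
    #include <map>
    #include <utility>

    // Minimal stand-in for the per-part value map in VPTransformState:
    // live-ins such as the trip count are stored for each unroll part before
    // codegen, and asserted to exist when a recipe asks for them.
    struct ToyTransformState {
      std::map<std::pair<const void *, unsigned>, int> PerPart;
      void set(const void *Def, int V, unsigned Part) { PerPart[{Def, Part}] = V; }
      int get(const void *Def, unsigned Part) const {
        auto It = PerPart.find({Def, Part});
        assert(It != PerPart.end() && "live-in was never materialized");
        return It->second;
      }
    };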
@@ -838,7 +846,7 @@
   // vector loop.
   VPBasicBlock *Header = Entry->getEntryBasicBlock();
   for (VPRecipeBase &R : Header->phis()) {
-    auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
+    auto *PhiR = dyn_cast<VPHeaderPHIRecipe>(&R);
     if (!PhiR || !(isa<VPFirstOrderRecurrencePHIRecipe>(&R) ||
                    isa<VPReductionPHIRecipe>(&R)))
       continue;
@@ -1461,7 +1469,7 @@
                              InterleavedAccessInfo &IAI) {
   if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
     for (VPRecipeBase &VPI : *VPBB) {
-      if (isa<VPWidenPHIRecipe>(&VPI))
+      if (isa<VPHeaderPHIRecipe>(&VPI))
         continue;
       assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
       auto *VPInst = cast<VPInstruction>(&VPI);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -881,7 +881,8 @@
   ConstantRange IdxRange(IntWidth, true);

   if (isGuaranteedNotToBePoison(Idx, &AC)) {
-    if (ValidIndices.contains(computeConstantRange(Idx, true, &AC, CtxI, &DT)))
+    if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
+                                                   true, &AC, CtxI, &DT)))
       return ScalarizationResult::safe();
     return ScalarizationResult::unsafe();
   }
diff --git a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
--- a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
+++ b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
@@ -669,7 +669,7 @@
   std::unique_ptr OutputDoc(xmlNewDoc((const unsigned char *)"1.0"));
   xmlDocSetRootElement(OutputDoc.get(), CombinedRoot);
-  assert(0 == xmlDocGetRootElement(CombinedDoc));
+  assert(nullptr == xmlDocGetRootElement(CombinedDoc));
   xmlKeepBlanksDefault(0);

   xmlChar *Buff = nullptr;
diff --git a/llvm/test/Assembler/ConstantExprFold.ll b/llvm/test/Assembler/ConstantExprFold.ll
--- a/llvm/test/Assembler/ConstantExprFold.ll
+++ b/llvm/test/Assembler/ConstantExprFold.ll
@@ -1,7 +1,9 @@
-; This test checks to make sure that constant exprs fold in some simple
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; This test checks to make sure that constant exprs fold in some simple
 ; situations
-; RUN: llvm-as < %s | llvm-dis | not grep "("
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 ; RUN: verify-uselistorder %s

 @A = global i64 0
@@ -17,12 +19,12 @@
 @8 = global i64* inttoptr (i64 xor (i64 ptrtoint (i64* @A to i64), i64 0) to i64*) ; X ^ 0 == X

 %Ty = type { i32, i32 }
-@B = external global %Ty
+@B = external global %Ty

 @9 = global i1 icmp ult (i64* @A, i64* getelementptr (i64, i64* @A, i64 1)) ; true
 @10 = global i1 icmp slt (i64* @A, i64* getelementptr (i64, i64* @A, i64 0)) ; false
-@11 = global i1 icmp slt (i32* getelementptr (%Ty, %Ty* @B, i64 0, i32 0),
-                          i32* getelementptr (%Ty, %Ty* @B, i64 0, i32 1)) ; true
+@11 = global i1 icmp ult (i32* getelementptr (%Ty, %Ty* @B, i64 0, i32 0),
+                          i32* getelementptr (%Ty, %Ty* @B, i64 0, i32 1)) ; true

 ;global i1 icmp ne (i64* @A, i64* bitcast (%Ty* @B to i64*)) ; true ; PR2206

@@ -33,3 +35,34 @@
 @14 = global <2 x i8*> getelementptr({ i8 }, <2 x { i8 }*> undef, <2 x i64> , <2 x i32> )
 @15 = global <2 x i8*> getelementptr(i8, <2 x i8*> zeroinitializer, <2 x i64> )
 @16 = global <2 x i8*> getelementptr({ i8 }, <2 x { i8 }*> zeroinitializer, <2 x i64> , <2 x i32> )
+
+
+; Need a function to make update_test_checks.py work.
+;.
+; CHECK: @[[A:[a-zA-Z0-9_$"\\.-]+]] = global i64 0 +; CHECK: @[[GLOB0:[0-9]+]] = global i64* @A +; CHECK: @[[GLOB1:[0-9]+]] = global i64* @A +; CHECK: @[[GLOB2:[0-9]+]] = global i64* null +; CHECK: @[[GLOB3:[0-9]+]] = global i64* @A +; CHECK: @[[GLOB4:[0-9]+]] = global i64* null +; CHECK: @[[GLOB5:[0-9]+]] = global i64* null +; CHECK: @[[GLOB6:[0-9]+]] = global i64* @A +; CHECK: @[[GLOB7:[0-9]+]] = global i64 -1 +; CHECK: @[[GLOB8:[0-9]+]] = global i64* @A +; CHECK: @[[B:[a-zA-Z0-9_$"\\.-]+]] = external global [[TY:%.*]] +; CHECK: @[[GLOB9:[0-9]+]] = global i1 true +; CHECK: @[[GLOB10:[0-9]+]] = global i1 false +; CHECK: @[[GLOB11:[0-9]+]] = global i1 true +; CHECK: @[[CONS:[a-zA-Z0-9_$"\\.-]+]] = weak global i32 0, align 8 +; CHECK: @[[GLOB12:[0-9]+]] = global i64 0 +; CHECK: @[[GLOB13:[0-9]+]] = global <2 x i8*> undef +; CHECK: @[[GLOB14:[0-9]+]] = global <2 x i8*> undef +; CHECK: @[[GLOB15:[0-9]+]] = global <2 x i8*> zeroinitializer +; CHECK: @[[GLOB16:[0-9]+]] = global <2 x i8*> zeroinitializer +;. +define void @dummy() { +; CHECK-LABEL: @dummy( +; CHECK-NEXT: ret void +; + ret void +} diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -1,4 +1,5 @@ -; RUN: llvm-as < %s | llvm-dis | FileCheck %s +; RUN: llvm-as < %s | llvm-dis | FileCheck %s --check-prefixes=CHECK,CHECK-TYPED +; RUN: llvm-as -opaque-pointers < %s | llvm-dis -opaque-pointers | FileCheck %s --check-prefixes=CHECK,CHECK-OPAQUE ; RUN: verify-uselistorder < %s ; PR12696 @@ -27,7 +28,8 @@ } define void @f5(i8* sret(i8) %0) -; CHECK: define void @f5(i8* sret(i8) %0) +; CHECK-TYPED: define void @f5(i8* sret(i8) %0) +; CHECK-OPAQUE: define void @f5(ptr sret(i8) %0) { ret void; } @@ -39,19 +41,22 @@ } define void @f7(i8* noalias %0) -; CHECK: define void @f7(i8* noalias %0) +; CHECK-TYPED: define void @f7(i8* noalias %0) +; CHECK-OPAQUE: define void @f7(ptr noalias %0) { ret void; } define void @f8(i8* byval(i8) %0) -; CHECK: define void @f8(i8* byval(i8) %0) +; CHECK-TYPED: define void @f8(i8* byval(i8) %0) +; CHECK-OPAQUE: define void @f8(ptr byval(i8) %0) { ret void; } define void @f9(i8* nest %0) -; CHECK: define void @f9(i8* nest %0) +; CHECK-TYPED: define void @f9(i8* nest %0) +; CHECK-OPAQUE: define void @f9(ptr nest %0) { ret void; } @@ -99,13 +104,15 @@ } define void @f17(i8* align 4 %0) -; CHECK: define void @f17(i8* align 4 %0) +; CHECK-TYPED: define void @f17(i8* align 4 %0) +; CHECK-OPAQUE: define void @f17(ptr align 4 %0) { ret void; } define void @f18(i8* nocapture %0) -; CHECK: define void @f18(i8* nocapture %0) +; CHECK-TYPED: define void @f18(i8* nocapture %0) +; CHECK-OPAQUE: define void @f18(ptr nocapture %0) { ret void; } @@ -215,12 +222,14 @@ } define void @f36(i8* inalloca(i8) %0) { -; CHECK: define void @f36(i8* inalloca(i8) %0) { +; CHECK-TYPED: define void @f36(i8* inalloca(i8) %0) { +; CHECK-OPAQUE: define void @f36(ptr inalloca(i8) %0) { ret void } define nonnull i8* @f37(i8* nonnull %a) { -; CHECK: define nonnull i8* @f37(i8* nonnull %a) { +; CHECK-TYPED: define nonnull i8* @f37(i8* nonnull %a) { +; CHECK-OPAQUE: define nonnull ptr @f37(ptr nonnull %a) { ret i8* %a } @@ -231,21 +240,25 @@ } define dereferenceable(2) i8* @f39(i8* dereferenceable(1) %a) { -; CHECK: define dereferenceable(2) i8* @f39(i8* dereferenceable(1) %a) { +; CHECK-TYPED: define dereferenceable(2) i8* @f39(i8* dereferenceable(1) %a) { +; CHECK-OPAQUE: define dereferenceable(2) ptr @f39(ptr dereferenceable(1) %a) { ret i8* %a 
} define dereferenceable(18446744073709551606) i8* @f40(i8* dereferenceable(18446744073709551615) %a) { -; CHECK: define dereferenceable(18446744073709551606) i8* @f40(i8* dereferenceable(18446744073709551615) %a) { +; CHECK-TYPED: define dereferenceable(18446744073709551606) i8* @f40(i8* dereferenceable(18446744073709551615) %a) { +; CHECK-OPAQUE: define dereferenceable(18446744073709551606) ptr @f40(ptr dereferenceable(18446744073709551615) %a) { ret i8* %a } define void @f41(i8* align 32 %0, double* align 64 %1) { -; CHECK: define void @f41(i8* align 32 %0, double* align 64 %1) { +; CHECK-TYPED: define void @f41(i8* align 32 %0, double* align 64 %1) { +; CHECK-OPAQUE: define void @f41(ptr align 32 %0, ptr align 64 %1) { ret void } -; CHECK: define dereferenceable_or_null(8) i8* @f42(i8* dereferenceable_or_null(8) %foo) +; CHECK-TYPED: define dereferenceable_or_null(8) i8* @f42(i8* dereferenceable_or_null(8) %foo) +; CHECK-OPAQUE: define dereferenceable_or_null(8) ptr @f42(ptr dereferenceable_or_null(8) %foo) define dereferenceable_or_null(8) i8* @f42(i8* dereferenceable_or_null(8) %foo) { entry: ret i8* %foo @@ -287,19 +300,22 @@ ret void } -; CHECK: define void @f50(i8* swiftself %0) +; CHECK-TYPED: define void @f50(i8* swiftself %0) +; CHECK-OPAQUE: define void @f50(ptr swiftself %0) define void @f50(i8* swiftself %0) { ret void; } -; CHECK: define i32 @f51(i8** swifterror %0) +; CHECK-TYPED: define i32 @f51(i8** swifterror %0) +; CHECK-OPAQUE: define i32 @f51(ptr swifterror %0) define i32 @f51(i8** swifterror %0) { ret i32 0 } -; CHECK: define i32 @f52(i32 %0, i8** swifterror %1) +; CHECK-TYPED: define i32 @f52(i32 %0, i8** swifterror %1) +; CHECK-OPAQUE: define i32 @f52(i32 %0, ptr swifterror %1) define i32 @f52(i32 %0, i8** swifterror %1) { ret i32 0 @@ -318,12 +334,14 @@ ret float 1.0 } -; CHECK: define i8* @f54(i32 %0) #30 +; CHECK-TYPED: define i8* @f54(i32 %0) #30 +; CHECK-OPAQUE: define ptr @f54(i32 %0) #30 define i8* @f54(i32 %0) allocsize(0) { ret i8* null } -; CHECK: define i8* @f55(i32 %0, i32 %1) #31 +; CHECK-TYPED: define i8* @f55(i32 %0, i32 %1) #31 +; CHECK-OPAQUE: define ptr @f55(i32 %0, i32 %1) #31 define i8* @f55(i32 %0, i32 %1) allocsize(0, 1) { ret i8* null } @@ -374,7 +392,8 @@ ret void } -; CHECK: define void @f64(i32* preallocated(i32) %a) +; CHECK-TYPED: define void @f64(i32* preallocated(i32) %a) +; CHECK-OPAQUE: define void @f64(ptr preallocated(i32) %a) define void @f64(i32* preallocated(i32) %a) { ret void @@ -392,7 +411,8 @@ ret i32 %a } -; CHECK: define void @f67(i32* byref(i32) %a) +; CHECK-TYPED: define void @f67(i32* byref(i32) %a) +; CHECK-OPAQUE: define void @f67(ptr byref(i32) %a) define void @f67(i32* byref(i32) %a) { ret void @@ -440,7 +460,8 @@ ret void } -; CHECK: define void @f76(i8* swiftasync %0) +; CHECK-TYPED: define void @f76(i8* swiftasync %0) +; CHECK-OPAQUE: define void @f76(ptr swiftasync %0) define void @f76(i8* swiftasync %0) { ret void; @@ -460,7 +481,8 @@ declare void @llvm.some.intrinsic(i32*) define void @f79() { -; CHECK: call void @llvm.some.intrinsic(i32* elementtype(i32) null) +; CHECK-TYPED: call void @llvm.some.intrinsic(i32* elementtype(i32) null) +; CHECK-OPAQUE: call void @llvm.some.intrinsic(ptr elementtype(i32) null) call void @llvm.some.intrinsic(i32* elementtype(i32) null) ret void } @@ -471,6 +493,23 @@ ret void; } +define void @f81(i8** sret(i8*) %0) +; CHECK-TYPED: define void @f81(i8** sret(i8*) %0) +; CHECK-OPAQUE: define void @f81(ptr sret(ptr) %0) +{ + ret void; +} + +define void @f82(i32* %0) +; CHECK-TYPED: 
define void @f82(i32* %0) +; CHECK-OPAQUE: define void @f82(ptr %0) +{ +; CHECK-TYPED: call void @llvm.some.intrinsic(i32* sret(i32) %0) +; CHECK-OPAQUE: call void @llvm.some.intrinsic(ptr sret(i32) %0) + call void @llvm.some.intrinsic(i32* sret(i32) %0) + ret void; +} + ; CHECK: attributes #0 = { noreturn } ; CHECK: attributes #1 = { nounwind } ; CHECK: attributes #2 = { readnone } diff --git a/llvm/test/CodeGen/AArch64/arm64-csel.ll b/llvm/test/CodeGen/AArch64/arm64-csel.ll --- a/llvm/test/CodeGen/AArch64/arm64-csel.ll +++ b/llvm/test/CodeGen/AArch64/arm64-csel.ll @@ -106,7 +106,6 @@ ; CHECK-NEXT: csel w0, w10, w9, ge ; CHECK-NEXT: ret entry: -; FIXME: Misspelled CHECK-NEXT %sub = sub nsw i32 %a, %b %cmp = icmp sgt i32 %sub, -1 %sub3 = sub nsw i32 0, %sub diff --git a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll --- a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll +++ b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll @@ -482,11 +482,9 @@ ; CHECK-NEXT: vstr.16 s0, [sp] ; CHECK-NEXT: vldr.16 s0, [r0] ; CHECK-NEXT: ldrb r1, [sp, #1] -; CHECK-NEXT: ands r1, r1, #128 ; CHECK-NEXT: vabs.f16 s0, s0 -; CHECK-NEXT: movwne r1, #1 +; CHECK-NEXT: tst r1, #128 ; CHECK-NEXT: vneg.f16 s2, s0 -; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: vseleq.f16 s0, s0, s2 ; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: add sp, sp, #4 diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll @@ -2379,67 +2379,42 @@ ; CHECK-NEXT: vorr q4, q0, q0 ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r2, r1 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: mov r3, #0 -; CHECK-NEXT: clz r7, r2 -; CHECK-NEXT: movwmi r3, #1 -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: vmov r0, r2, d9 +; CHECK-NEXT: cmn r4, #-2147483647 ; CHECK-NEXT: mvn r3, #-2147483648 +; CHECK-NEXT: movlo r3, r4 ; CHECK-NEXT: mvn r5, #-2147483648 -; CHECK-NEXT: movne r3, r4 -; CHECK-NEXT: cmn r4, #-2147483647 -; CHECK-NEXT: movhs r4, r5 -; CHECK-NEXT: lsr r7, r7, #5 -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: movpl r4, r5 +; CHECK-NEXT: movpl r1, r6 ; CHECK-NEXT: moveq r4, r3 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: movpl r2, r6 -; CHECK-NEXT: cmn r2, #1 -; CHECK-NEXT: mov r3, #0 -; CHECK-NEXT: add r2, r2, #1 -; CHECK-NEXT: movwgt r3, #1 -; CHECK-NEXT: clz r2, r2 -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: cmn r1, #1 ; CHECK-NEXT: mov r3, #-2147483648 -; CHECK-NEXT: movne r3, r4 ; CHECK-NEXT: mov r7, #-2147483648 +; CHECK-NEXT: movgt r3, r4 ; CHECK-NEXT: cmp r4, #-2147483648 -; CHECK-NEXT: lsr r2, r2, #5 ; CHECK-NEXT: movls r4, r7 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: moveq r4, r3 +; CHECK-NEXT: cmn r1, #1 +; CHECK-NEXT: movne r4, r3 +; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: movwmi r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: mvn r2, #-2147483648 -; CHECK-NEXT: vmov.32 d0[0], r4 -; CHECK-NEXT: movne r2, r0 ; CHECK-NEXT: cmn r0, #-2147483647 -; CHECK-NEXT: movlo r5, r0 -; CHECK-NEXT: clz r0, r1 -; CHECK-NEXT: lsr r0, r0, #5 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: moveq r5, r2 +; CHECK-NEXT: mvn r2, #-2147483648 +; CHECK-NEXT: movlo r2, r0 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: movpl r1, r6 -; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: add r1, r1, #1 -; CHECK-NEXT: movwgt r6, #1 -; CHECK-NEXT: clz r1, r1 -; CHECK-NEXT: 
cmp r6, #0 +; CHECK-NEXT: movmi r5, r0 +; CHECK-NEXT: movmi r6, r1 +; CHECK-NEXT: moveq r5, r2 +; CHECK-NEXT: cmn r6, #1 ; CHECK-NEXT: mov r0, #-2147483648 -; CHECK-NEXT: movne r0, r5 +; CHECK-NEXT: vmov.32 d0[0], r4 +; CHECK-NEXT: movgt r0, r5 ; CHECK-NEXT: cmp r5, #-2147483648 ; CHECK-NEXT: movls r5, r7 -; CHECK-NEXT: lsr r1, r1, #5 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: moveq r5, r0 +; CHECK-NEXT: cmn r6, #1 +; CHECK-NEXT: movne r5, r0 ; CHECK-NEXT: vmov.32 d0[1], r5 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc} @@ -2485,63 +2460,45 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i32_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vorr q4, q0, q0 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: vmov r2, r1, d8 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: mov r3, #0 -; CHECK-NEXT: movwmi r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: clz r3, r4 -; CHECK-NEXT: mvn r8, #0 -; CHECK-NEXT: movne r8, r0 +; CHECK-NEXT: vmov r2, r12, d9 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mvn r3, #0 ; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: lsr r3, r3, #5 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: movne r8, r0 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: movpl r4, r5 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: movmi r3, r0 +; CHECK-NEXT: movpl r1, r5 +; CHECK-NEXT: moveq r3, r0 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: mvn r7, #0 +; CHECK-NEXT: mvn r4, #0 ; CHECK-NEXT: movwgt r6, #1 ; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: movne r6, r8 +; CHECK-NEXT: movne r6, r3 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: moveq r6, r3 ; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r1, r12 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: movwmi r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: clz r2, r1 -; CHECK-NEXT: movne r7, r0 -; CHECK-NEXT: lsr r2, r2, #5 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: movne r7, r0 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov.32 d0[0], r6 +; CHECK-NEXT: movmi r4, r0 ; CHECK-NEXT: movpl r1, r5 -; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: moveq r4, r0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movwgt r5, #1 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: movne r5, r7 -; CHECK-NEXT: lsr r0, r0, #5 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: clz r0, r4 -; CHECK-NEXT: movne r5, r7 -; CHECK-NEXT: vmov.32 d0[0], r5 -; CHECK-NEXT: lsr r0, r0, #5 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: movne r6, r8 -; CHECK-NEXT: vmov.32 d0[1], r6 +; CHECK-NEXT: movne r5, r4 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: moveq r5, r4 +; CHECK-NEXT: vmov.32 d0[1], r5 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %conv = fptosi <2 x double> %x to <2 x i64> %spec.store.select = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %conv, <2 x i64> ) @@ -2560,129 +2517,78 @@ ; CHECK-NEXT: vorr q4, q0, q0 ; CHECK-NEXT: mov r8, #-2147483648 ; CHECK-NEXT: mvn r7, #-2147483648 -; CHECK-NEXT: mov r9, #0 ; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r5, s16 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: clz r2, r1 -; CHECK-NEXT: movwmi r0, #1 -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: 
cmn r0, #-2147483647 ; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: lsr r2, r2, #5 -; CHECK-NEXT: movne r0, r4 -; CHECK-NEXT: cmn r4, #-2147483647 -; CHECK-NEXT: movhs r4, r7 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: moveq r4, r0 +; CHECK-NEXT: mov r9, #0 +; CHECK-NEXT: movlo r0, r4 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: movpl r4, r7 ; CHECK-NEXT: movpl r1, r9 +; CHECK-NEXT: moveq r4, r0 ; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: add r1, r1, #1 -; CHECK-NEXT: movwgt r0, #1 -; CHECK-NEXT: clz r1, r1 -; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, #-2147483648 -; CHECK-NEXT: movne r0, r4 +; CHECK-NEXT: movgt r0, r4 ; CHECK-NEXT: cmp r4, #-2147483648 ; CHECK-NEXT: movls r4, r8 -; CHECK-NEXT: lsr r1, r1, #5 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: moveq r4, r0 +; CHECK-NEXT: cmn r1, #1 +; CHECK-NEXT: movne r4, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: clz r2, r1 -; CHECK-NEXT: movwmi r0, #1 -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cmn r0, #-2147483647 ; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: lsr r2, r2, #5 -; CHECK-NEXT: movne r0, r5 -; CHECK-NEXT: cmn r5, #-2147483647 -; CHECK-NEXT: movhs r5, r7 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: moveq r5, r0 +; CHECK-NEXT: mov r2, #-2147483648 +; CHECK-NEXT: movlo r0, r5 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: movpl r5, r7 ; CHECK-NEXT: movpl r1, r9 +; CHECK-NEXT: moveq r5, r0 +; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: mov r2, #-2147483648 -; CHECK-NEXT: movwgt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: add r0, r1, #1 -; CHECK-NEXT: movne r2, r5 -; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: movgt r2, r5 ; CHECK-NEXT: cmp r5, #-2147483648 ; CHECK-NEXT: movls r5, r8 -; CHECK-NEXT: lsr r1, r0, #5 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: moveq r5, r2 +; CHECK-NEXT: cmn r1, #1 +; CHECK-NEXT: movne r5, r2 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: clz r2, r1 -; CHECK-NEXT: movwmi r0, #1 -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cmn r0, #-2147483647 ; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: lsr r2, r2, #5 -; CHECK-NEXT: movne r0, r6 -; CHECK-NEXT: cmn r6, #-2147483647 -; CHECK-NEXT: movhs r6, r7 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: moveq r6, r0 +; CHECK-NEXT: mov r2, #-2147483648 +; CHECK-NEXT: movlo r0, r6 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: movpl r6, r7 ; CHECK-NEXT: movpl r1, r9 +; CHECK-NEXT: moveq r6, r0 +; CHECK-NEXT: vmov r0, s17 ; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: mov r2, #-2147483648 -; CHECK-NEXT: movwgt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: add r0, r1, #1 -; CHECK-NEXT: movne r2, r6 -; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: movgt r2, r6 ; CHECK-NEXT: cmp r6, #-2147483648 ; CHECK-NEXT: movls r6, r8 -; CHECK-NEXT: lsr r1, r0, #5 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: moveq r6, r2 +; CHECK-NEXT: cmn r1, #1 +; CHECK-NEXT: movne r6, r2 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: movwmi r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: mvn r2, #-2147483648 -; CHECK-NEXT: vmov.32 d0[0], r6 -; CHECK-NEXT: movne r2, r0 ; CHECK-NEXT: cmn r0, #-2147483647 -; CHECK-NEXT: movlo r7, r0 -; CHECK-NEXT: clz r0, r1 -; CHECK-NEXT: vmov.32 d1[0], r5 -; CHECK-NEXT: lsr r0, r0, #5 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: moveq r7, r2 +; 
CHECK-NEXT: mvn r2, #-2147483648 +; CHECK-NEXT: movlo r2, r0 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: movpl r1, r9 -; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: add r1, r1, #1 -; CHECK-NEXT: movwgt r9, #1 -; CHECK-NEXT: clz r1, r1 -; CHECK-NEXT: cmp r9, #0 +; CHECK-NEXT: movmi r7, r0 +; CHECK-NEXT: movmi r9, r1 +; CHECK-NEXT: moveq r7, r2 +; CHECK-NEXT: cmn r9, #1 ; CHECK-NEXT: mov r0, #-2147483648 -; CHECK-NEXT: vmov.32 d1[1], r4 -; CHECK-NEXT: movne r0, r7 +; CHECK-NEXT: vmov.32 d1[0], r6 +; CHECK-NEXT: movgt r0, r7 ; CHECK-NEXT: cmp r7, #-2147483648 +; CHECK-NEXT: vmov.32 d0[0], r5 ; CHECK-NEXT: movls r7, r8 -; CHECK-NEXT: lsr r1, r1, #5 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: moveq r7, r0 +; CHECK-NEXT: cmn r9, #1 +; CHECK-NEXT: vmov.32 d1[1], r4 +; CHECK-NEXT: movne r7, r0 ; CHECK-NEXT: vmov.32 d0[1], r7 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} @@ -2743,115 +2649,75 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i32_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vorr q4, q0, q0 +; CHECK-NEXT: mvn r9, #0 ; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r8, s18 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmov r0, s17 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r3, #0 -; CHECK-NEXT: movwmi r3, #1 -; CHECK-NEXT: clz r6, r1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mvn r3, #0 -; CHECK-NEXT: movne r3, r2 -; CHECK-NEXT: lsr r6, r6, #5 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: mvn r2, #0 +; CHECK-NEXT: movmi r2, r0 ; CHECK-NEXT: mov r7, #0 -; CHECK-NEXT: movne r3, r2 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: moveq r2, r0 ; CHECK-NEXT: movpl r1, r7 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r11, #0 -; CHECK-NEXT: clz r1, r1 -; CHECK-NEXT: movwgt r11, #1 -; CHECK-NEXT: cmp r11, #0 -; CHECK-NEXT: movne r11, r3 -; CHECK-NEXT: lsr r1, r1, #5 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvn r9, #0 -; CHECK-NEXT: vmov r8, s16 -; CHECK-NEXT: movne r11, r3 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r1, #0 -; CHECK-NEXT: mvn r10, #0 -; CHECK-NEXT: movwmi r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: clz r1, r4 -; CHECK-NEXT: movne r10, r0 -; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: lsr r1, r1, #5 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: movne r10, r0 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: movpl r4, r7 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movwgt r4, #1 ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: movwgt r6, #1 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: movne r6, r10 +; CHECK-NEXT: movne r4, r2 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: moveq r4, r2 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: movwmi r2, #1 -; CHECK-NEXT: clz r3, r1 -; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvn r2, #0 -; CHECK-NEXT: movne r2, r0 -; CHECK-NEXT: lsr r3, r3, #5 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: movne r2, r0 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: movmi r2, r0 ; CHECK-NEXT: movpl r1, r7 -; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: moveq r2, r0 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mov r5, #0 +; 
CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: movwgt r5, #1 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: lsr r0, r0, #5 -; CHECK-NEXT: movne r5, r2 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: movne r5, r2 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: moveq r5, r2 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: movwmi r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: clz r2, r1 -; CHECK-NEXT: movne r9, r0 -; CHECK-NEXT: vmov.32 d0[0], r5 -; CHECK-NEXT: lsr r2, r2, #5 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: movne r9, r0 +; CHECK-NEXT: mvn r2, #0 +; CHECK-NEXT: movmi r2, r0 +; CHECK-NEXT: movpl r1, r7 +; CHECK-NEXT: moveq r2, r0 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: movwgt r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r6, r2 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: moveq r6, r2 +; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov.32 d1[0], r6 +; CHECK-NEXT: movmi r9, r0 ; CHECK-NEXT: movpl r1, r7 -; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: moveq r9, r0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movwgt r7, #1 ; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: vmov.32 d0[0], r5 ; CHECK-NEXT: movne r7, r9 -; CHECK-NEXT: lsr r0, r0, #5 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: clz r0, r4 -; CHECK-NEXT: movne r7, r9 -; CHECK-NEXT: vmov.32 d1[0], r7 -; CHECK-NEXT: lsr r0, r0, #5 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: movne r6, r10 -; CHECK-NEXT: vmov.32 d1[1], r11 -; CHECK-NEXT: vmov.32 d0[1], r6 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov.32 d1[1], r4 +; CHECK-NEXT: moveq r7, r9 +; CHECK-NEXT: vmov.32 d0[1], r7 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} entry: %conv = fptosi <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2868,136 +2734,85 @@ ; CHECK-NEON-NEXT: .vsave {d8, d9, d10} ; CHECK-NEON-NEXT: vpush {d8, d9, d10} ; CHECK-NEON-NEXT: vmov r0, s3 -; CHECK-NEON-NEXT: vmov.f32 s20, s2 +; CHECK-NEON-NEXT: vmov.f32 s18, s2 ; CHECK-NEON-NEXT: vmov.f32 s16, s1 -; CHECK-NEON-NEXT: vmov.f32 s18, s0 +; CHECK-NEON-NEXT: vmov.f32 s20, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: mov r4, r0 ; CHECK-NEON-NEXT: vmov r0, s20 -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: mov r2, #0 -; CHECK-NEON-NEXT: movwmi r2, #1 -; CHECK-NEON-NEXT: clz r3, r1 -; CHECK-NEON-NEXT: cmp r2, #0 +; CHECK-NEON-NEXT: cmn r4, #-2147483647 ; CHECK-NEON-NEXT: mvn r2, #-2147483648 -; CHECK-NEON-NEXT: movne r2, r4 +; CHECK-NEON-NEXT: movlo r2, r4 ; CHECK-NEON-NEXT: mvn r7, #-2147483648 -; CHECK-NEON-NEXT: cmn r4, #-2147483647 -; CHECK-NEON-NEXT: lsr r3, r3, #5 -; CHECK-NEON-NEXT: movhs r4, r7 -; CHECK-NEON-NEXT: cmp r3, #0 -; CHECK-NEON-NEXT: moveq r4, r2 -; CHECK-NEON-NEXT: mov r9, #0 ; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: mov r2, #0 +; CHECK-NEON-NEXT: mov r9, #0 +; CHECK-NEON-NEXT: movpl r4, r7 ; CHECK-NEON-NEXT: movpl r1, r9 +; CHECK-NEON-NEXT: moveq r4, r2 ; CHECK-NEON-NEXT: cmn r1, #1 -; CHECK-NEON-NEXT: movwgt r2, #1 -; CHECK-NEON-NEXT: add r1, r1, #1 -; CHECK-NEON-NEXT: clz r1, r1 -; CHECK-NEON-NEXT: cmp r2, #0 ; CHECK-NEON-NEXT: mov r2, #-2147483648 ; CHECK-NEON-NEXT: mov r8, #-2147483648 -; CHECK-NEON-NEXT: movne r2, r4 +; CHECK-NEON-NEXT: movgt r2, r4 ; CHECK-NEON-NEXT: cmp r4, #-2147483648 ; CHECK-NEON-NEXT: movls r4, r8 -; CHECK-NEON-NEXT: lsr r1, r1, 
#5 -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: moveq r4, r2 +; CHECK-NEON-NEXT: cmn r1, #1 +; CHECK-NEON-NEXT: movne r4, r2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: mov r5, r0 -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: mov r0, #0 -; CHECK-NEON-NEXT: clz r2, r1 -; CHECK-NEON-NEXT: movwmi r0, #1 -; CHECK-NEON-NEXT: cmp r0, #0 +; CHECK-NEON-NEXT: cmn r0, #-2147483647 ; CHECK-NEON-NEXT: mvn r0, #-2147483648 -; CHECK-NEON-NEXT: lsr r2, r2, #5 -; CHECK-NEON-NEXT: movne r0, r5 -; CHECK-NEON-NEXT: cmn r5, #-2147483647 -; CHECK-NEON-NEXT: movhs r5, r7 -; CHECK-NEON-NEXT: cmp r2, #0 -; CHECK-NEON-NEXT: moveq r5, r0 +; CHECK-NEON-NEXT: mov r2, #-2147483648 +; CHECK-NEON-NEXT: movlo r0, r5 ; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: movpl r5, r7 ; CHECK-NEON-NEXT: movpl r1, r9 +; CHECK-NEON-NEXT: moveq r5, r0 +; CHECK-NEON-NEXT: vmov r0, s18 ; CHECK-NEON-NEXT: cmn r1, #1 -; CHECK-NEON-NEXT: mov r0, #0 -; CHECK-NEON-NEXT: mov r2, #-2147483648 -; CHECK-NEON-NEXT: movwgt r0, #1 -; CHECK-NEON-NEXT: cmp r0, #0 -; CHECK-NEON-NEXT: add r0, r1, #1 -; CHECK-NEON-NEXT: movne r2, r5 -; CHECK-NEON-NEXT: clz r0, r0 +; CHECK-NEON-NEXT: movgt r2, r5 ; CHECK-NEON-NEXT: cmp r5, #-2147483648 ; CHECK-NEON-NEXT: movls r5, r8 -; CHECK-NEON-NEXT: lsr r1, r0, #5 -; CHECK-NEON-NEXT: vmov r0, s18 -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: moveq r5, r2 +; CHECK-NEON-NEXT: cmn r1, #1 +; CHECK-NEON-NEXT: movne r5, r2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: mov r6, r0 -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: mov r0, #0 -; CHECK-NEON-NEXT: clz r2, r1 -; CHECK-NEON-NEXT: movwmi r0, #1 -; CHECK-NEON-NEXT: cmp r0, #0 +; CHECK-NEON-NEXT: cmn r0, #-2147483647 ; CHECK-NEON-NEXT: mvn r0, #-2147483648 -; CHECK-NEON-NEXT: lsr r2, r2, #5 -; CHECK-NEON-NEXT: movne r0, r6 -; CHECK-NEON-NEXT: cmn r6, #-2147483647 -; CHECK-NEON-NEXT: movhs r6, r7 -; CHECK-NEON-NEXT: cmp r2, #0 -; CHECK-NEON-NEXT: moveq r6, r0 +; CHECK-NEON-NEXT: mov r2, #-2147483648 +; CHECK-NEON-NEXT: movlo r0, r6 ; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: movpl r6, r7 ; CHECK-NEON-NEXT: movpl r1, r9 +; CHECK-NEON-NEXT: moveq r6, r0 +; CHECK-NEON-NEXT: vmov r0, s16 ; CHECK-NEON-NEXT: cmn r1, #1 -; CHECK-NEON-NEXT: mov r0, #0 -; CHECK-NEON-NEXT: mov r2, #-2147483648 -; CHECK-NEON-NEXT: movwgt r0, #1 -; CHECK-NEON-NEXT: cmp r0, #0 -; CHECK-NEON-NEXT: add r0, r1, #1 -; CHECK-NEON-NEXT: movne r2, r6 -; CHECK-NEON-NEXT: clz r0, r0 +; CHECK-NEON-NEXT: movgt r2, r6 ; CHECK-NEON-NEXT: cmp r6, #-2147483648 ; CHECK-NEON-NEXT: movls r6, r8 -; CHECK-NEON-NEXT: lsr r1, r0, #5 -; CHECK-NEON-NEXT: vmov r0, s16 -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: moveq r6, r2 +; CHECK-NEON-NEXT: cmn r1, #1 +; CHECK-NEON-NEXT: movne r6, r2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: mov r2, #0 -; CHECK-NEON-NEXT: movwmi r2, #1 -; CHECK-NEON-NEXT: cmp r2, #0 -; CHECK-NEON-NEXT: mvn r2, #-2147483648 -; CHECK-NEON-NEXT: vmov.32 d0[0], r6 -; CHECK-NEON-NEXT: movne r2, r0 ; CHECK-NEON-NEXT: cmn r0, #-2147483647 -; CHECK-NEON-NEXT: movlo r7, r0 -; CHECK-NEON-NEXT: clz r0, r1 -; CHECK-NEON-NEXT: vmov.32 d1[0], r5 -; CHECK-NEON-NEXT: lsr r0, r0, #5 -; CHECK-NEON-NEXT: cmp r0, #0 -; CHECK-NEON-NEXT: moveq r7, r2 +; CHECK-NEON-NEXT: mvn r2, #-2147483648 +; CHECK-NEON-NEXT: movlo r2, r0 ; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: movpl r1, r9 -; CHECK-NEON-NEXT: cmn r1, #1 -; 
CHECK-NEON-NEXT: add r1, r1, #1 -; CHECK-NEON-NEXT: movwgt r9, #1 -; CHECK-NEON-NEXT: clz r1, r1 -; CHECK-NEON-NEXT: cmp r9, #0 +; CHECK-NEON-NEXT: movmi r7, r0 +; CHECK-NEON-NEXT: movmi r9, r1 +; CHECK-NEON-NEXT: moveq r7, r2 +; CHECK-NEON-NEXT: cmn r9, #1 ; CHECK-NEON-NEXT: mov r0, #-2147483648 -; CHECK-NEON-NEXT: vmov.32 d1[1], r4 -; CHECK-NEON-NEXT: movne r0, r7 +; CHECK-NEON-NEXT: vmov.32 d1[0], r6 +; CHECK-NEON-NEXT: movgt r0, r7 ; CHECK-NEON-NEXT: cmp r7, #-2147483648 +; CHECK-NEON-NEXT: vmov.32 d0[0], r5 ; CHECK-NEON-NEXT: movls r7, r8 -; CHECK-NEON-NEXT: lsr r1, r1, #5 -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: moveq r7, r0 +; CHECK-NEON-NEXT: cmn r9, #1 +; CHECK-NEON-NEXT: vmov.32 d1[1], r4 +; CHECK-NEON-NEXT: movne r7, r0 ; CHECK-NEON-NEXT: vmov.32 d0[1], r7 ; CHECK-NEON-NEXT: vpop {d8, d9, d10} ; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} @@ -3013,131 +2828,80 @@ ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi ; CHECK-FP16-NEXT: mov r4, r0 -; CHECK-FP16-NEXT: vmov.u16 r0, d8[0] -; CHECK-FP16-NEXT: vmov.u16 r2, d8[2] -; CHECK-FP16-NEXT: cmp r1, #0 +; CHECK-FP16-NEXT: vmov.u16 r0, d8[2] +; CHECK-FP16-NEXT: vmov.u16 r2, d8[0] +; CHECK-FP16-NEXT: cmn r4, #-2147483647 ; CHECK-FP16-NEXT: mvn r7, #-2147483648 ; CHECK-FP16-NEXT: mov r9, #0 ; CHECK-FP16-NEXT: mov r8, #-2147483648 ; CHECK-FP16-NEXT: vmov s18, r0 -; CHECK-FP16-NEXT: mov r0, #0 -; CHECK-FP16-NEXT: movwmi r0, #1 -; CHECK-FP16-NEXT: vmov s0, r2 -; CHECK-FP16-NEXT: clz r2, r1 -; CHECK-FP16-NEXT: cmp r0, #0 ; CHECK-FP16-NEXT: mvn r0, #-2147483648 -; CHECK-FP16-NEXT: movne r0, r4 -; CHECK-FP16-NEXT: cmn r4, #-2147483647 -; CHECK-FP16-NEXT: movhs r4, r7 -; CHECK-FP16-NEXT: lsr r2, r2, #5 -; CHECK-FP16-NEXT: cmp r2, #0 -; CHECK-FP16-NEXT: moveq r4, r0 +; CHECK-FP16-NEXT: movlo r0, r4 ; CHECK-FP16-NEXT: cmp r1, #0 +; CHECK-FP16-NEXT: movpl r4, r7 ; CHECK-FP16-NEXT: movpl r1, r9 +; CHECK-FP16-NEXT: moveq r4, r0 ; CHECK-FP16-NEXT: cmn r1, #1 -; CHECK-FP16-NEXT: mov r0, #0 -; CHECK-FP16-NEXT: add r1, r1, #1 -; CHECK-FP16-NEXT: movwgt r0, #1 -; CHECK-FP16-NEXT: clz r1, r1 -; CHECK-FP16-NEXT: cmp r0, #0 ; CHECK-FP16-NEXT: mov r0, #-2147483648 -; CHECK-FP16-NEXT: movne r0, r4 +; CHECK-FP16-NEXT: vmov s0, r2 +; CHECK-FP16-NEXT: movgt r0, r4 ; CHECK-FP16-NEXT: cmp r4, #-2147483648 ; CHECK-FP16-NEXT: movls r4, r8 -; CHECK-FP16-NEXT: lsr r1, r1, #5 -; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: moveq r4, r0 +; CHECK-FP16-NEXT: cmn r1, #1 +; CHECK-FP16-NEXT: movne r4, r0 ; CHECK-FP16-NEXT: bl __fixhfdi +; CHECK-FP16-NEXT: vmov.f32 s0, s18 ; CHECK-FP16-NEXT: mov r5, r0 -; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: mov r0, #0 -; CHECK-FP16-NEXT: clz r2, r1 -; CHECK-FP16-NEXT: movwmi r0, #1 -; CHECK-FP16-NEXT: cmp r0, #0 +; CHECK-FP16-NEXT: cmn r0, #-2147483647 ; CHECK-FP16-NEXT: mvn r0, #-2147483648 -; CHECK-FP16-NEXT: vmov.f32 s0, s18 -; CHECK-FP16-NEXT: movne r0, r5 -; CHECK-FP16-NEXT: cmn r5, #-2147483647 -; CHECK-FP16-NEXT: lsr r2, r2, #5 -; CHECK-FP16-NEXT: movhs r5, r7 -; CHECK-FP16-NEXT: cmp r2, #0 -; CHECK-FP16-NEXT: moveq r5, r0 +; CHECK-FP16-NEXT: movlo r0, r5 ; CHECK-FP16-NEXT: cmp r1, #0 +; CHECK-FP16-NEXT: movpl r5, r7 ; CHECK-FP16-NEXT: movpl r1, r9 +; CHECK-FP16-NEXT: moveq r5, r0 ; CHECK-FP16-NEXT: cmn r1, #1 -; CHECK-FP16-NEXT: mov r0, #0 -; CHECK-FP16-NEXT: add r1, r1, #1 -; CHECK-FP16-NEXT: movwgt r0, #1 -; CHECK-FP16-NEXT: clz r1, r1 -; CHECK-FP16-NEXT: cmp r0, #0 ; CHECK-FP16-NEXT: mov r0, #-2147483648 -; CHECK-FP16-NEXT: movne r0, r5 +; CHECK-FP16-NEXT: movgt r0, r5 ; 
CHECK-FP16-NEXT: cmp r5, #-2147483648 ; CHECK-FP16-NEXT: movls r5, r8 -; CHECK-FP16-NEXT: lsr r1, r1, #5 -; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: moveq r5, r0 +; CHECK-FP16-NEXT: cmn r1, #1 +; CHECK-FP16-NEXT: movne r5, r0 ; CHECK-FP16-NEXT: bl __fixhfdi +; CHECK-FP16-NEXT: vmov.u16 r2, d8[1] ; CHECK-FP16-NEXT: mov r6, r0 -; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: mov r0, #0 -; CHECK-FP16-NEXT: clz r2, r1 -; CHECK-FP16-NEXT: movwmi r0, #1 -; CHECK-FP16-NEXT: cmp r0, #0 +; CHECK-FP16-NEXT: cmn r0, #-2147483647 ; CHECK-FP16-NEXT: mvn r0, #-2147483648 -; CHECK-FP16-NEXT: lsr r2, r2, #5 -; CHECK-FP16-NEXT: movne r0, r6 -; CHECK-FP16-NEXT: cmn r6, #-2147483647 -; CHECK-FP16-NEXT: movhs r6, r7 -; CHECK-FP16-NEXT: cmp r2, #0 -; CHECK-FP16-NEXT: vmov.u16 r2, d8[1] -; CHECK-FP16-NEXT: moveq r6, r0 +; CHECK-FP16-NEXT: movlo r0, r6 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: mov r0, #0 +; CHECK-FP16-NEXT: movpl r6, r7 ; CHECK-FP16-NEXT: movpl r1, r9 +; CHECK-FP16-NEXT: moveq r6, r0 ; CHECK-FP16-NEXT: cmn r1, #1 -; CHECK-FP16-NEXT: movwgt r0, #1 -; CHECK-FP16-NEXT: add r1, r1, #1 -; CHECK-FP16-NEXT: clz r1, r1 -; CHECK-FP16-NEXT: cmp r0, #0 ; CHECK-FP16-NEXT: mov r0, #-2147483648 -; CHECK-FP16-NEXT: movne r0, r6 +; CHECK-FP16-NEXT: movgt r0, r6 ; CHECK-FP16-NEXT: cmp r6, #-2147483648 ; CHECK-FP16-NEXT: movls r6, r8 -; CHECK-FP16-NEXT: lsr r1, r1, #5 -; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: moveq r6, r0 +; CHECK-FP16-NEXT: cmn r1, #1 +; CHECK-FP16-NEXT: movne r6, r0 ; CHECK-FP16-NEXT: vmov s0, r2 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: mov r2, #0 -; CHECK-FP16-NEXT: movwmi r2, #1 -; CHECK-FP16-NEXT: cmp r2, #0 -; CHECK-FP16-NEXT: mvn r2, #-2147483648 -; CHECK-FP16-NEXT: vmov.32 d0[0], r6 -; CHECK-FP16-NEXT: movne r2, r0 ; CHECK-FP16-NEXT: cmn r0, #-2147483647 -; CHECK-FP16-NEXT: movlo r7, r0 -; CHECK-FP16-NEXT: clz r0, r1 -; CHECK-FP16-NEXT: vmov.32 d1[0], r5 -; CHECK-FP16-NEXT: lsr r0, r0, #5 -; CHECK-FP16-NEXT: cmp r0, #0 -; CHECK-FP16-NEXT: moveq r7, r2 +; CHECK-FP16-NEXT: mvn r2, #-2147483648 +; CHECK-FP16-NEXT: movlo r2, r0 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: movpl r1, r9 -; CHECK-FP16-NEXT: cmn r1, #1 -; CHECK-FP16-NEXT: add r1, r1, #1 -; CHECK-FP16-NEXT: movwgt r9, #1 -; CHECK-FP16-NEXT: clz r1, r1 -; CHECK-FP16-NEXT: cmp r9, #0 +; CHECK-FP16-NEXT: movmi r7, r0 +; CHECK-FP16-NEXT: movmi r9, r1 +; CHECK-FP16-NEXT: moveq r7, r2 +; CHECK-FP16-NEXT: cmn r9, #1 ; CHECK-FP16-NEXT: mov r0, #-2147483648 -; CHECK-FP16-NEXT: vmov.32 d1[1], r4 -; CHECK-FP16-NEXT: movne r0, r7 +; CHECK-FP16-NEXT: vmov.32 d1[0], r6 +; CHECK-FP16-NEXT: movgt r0, r7 ; CHECK-FP16-NEXT: cmp r7, #-2147483648 +; CHECK-FP16-NEXT: vmov.32 d0[0], r5 ; CHECK-FP16-NEXT: movls r7, r8 -; CHECK-FP16-NEXT: lsr r1, r1, #5 -; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: moveq r7, r0 +; CHECK-FP16-NEXT: cmn r9, #1 +; CHECK-FP16-NEXT: vmov.32 d1[1], r4 +; CHECK-FP16-NEXT: movne r7, r0 ; CHECK-FP16-NEXT: vmov.32 d0[1], r7 ; CHECK-FP16-NEXT: vpop {d8, d9} ; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} @@ -3247,233 +3011,157 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NEON-LABEL: ustest_f16i32_mm: ; CHECK-NEON: @ %bb.0: @ %entry -; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEON-NEXT: .pad #4 -; CHECK-NEON-NEXT: sub sp, sp, #4 +; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, 
r11, lr}
; CHECK-NEON-NEXT: .vsave {d8, d9, d10}
; CHECK-NEON-NEXT: vpush {d8, d9, d10}
; CHECK-NEON-NEXT: vmov r0, s3
-; CHECK-NEON-NEXT: vmov.f32 s16, s2
-; CHECK-NEON-NEXT: vmov.f32 s18, s1
+; CHECK-NEON-NEXT: vmov.f32 s18, s2
+; CHECK-NEON-NEXT: vmov.f32 s16, s1
; CHECK-NEON-NEXT: vmov.f32 s20, s0
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: mov r2, r0
-; CHECK-NEON-NEXT: vmov r0, s18
+; CHECK-NEON-NEXT: vmov r2, s20
; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: mov r3, #0
-; CHECK-NEON-NEXT: movwmi r3, #1
-; CHECK-NEON-NEXT: clz r6, r1
-; CHECK-NEON-NEXT: cmp r3, #0
; CHECK-NEON-NEXT: mvn r3, #0
-; CHECK-NEON-NEXT: movne r3, r2
-; CHECK-NEON-NEXT: lsr r6, r6, #5
-; CHECK-NEON-NEXT: cmp r6, #0
-; CHECK-NEON-NEXT: mov r7, #0
-; CHECK-NEON-NEXT: movne r3, r2
-; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: movpl r1, r7
+; CHECK-NEON-NEXT: mov r6, #0
+; CHECK-NEON-NEXT: movmi r3, r0
+; CHECK-NEON-NEXT: movpl r1, r6
+; CHECK-NEON-NEXT: moveq r3, r0
; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: mov r11, #0
-; CHECK-NEON-NEXT: clz r1, r1
-; CHECK-NEON-NEXT: movwgt r11, #1
-; CHECK-NEON-NEXT: cmp r11, #0
-; CHECK-NEON-NEXT: movne r11, r3
-; CHECK-NEON-NEXT: lsr r1, r1, #5
+; CHECK-NEON-NEXT: mov r7, #0
+; CHECK-NEON-NEXT: vmov r8, s18
+; CHECK-NEON-NEXT: movwgt r7, #1
+; CHECK-NEON-NEXT: cmp r7, #0
+; CHECK-NEON-NEXT: movne r7, r3
; CHECK-NEON-NEXT: cmp r1, #0
; CHECK-NEON-NEXT: mvn r9, #0
-; CHECK-NEON-NEXT: vmov r8, s20
-; CHECK-NEON-NEXT: movne r11, r3
+; CHECK-NEON-NEXT: moveq r7, r3
+; CHECK-NEON-NEXT: mov r0, r2
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: mov r4, r1
; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: mov r1, #0
-; CHECK-NEON-NEXT: mvn r10, #0
-; CHECK-NEON-NEXT: movwmi r1, #1
-; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: clz r1, r4
-; CHECK-NEON-NEXT: movne r10, r0
-; CHECK-NEON-NEXT: mov r6, #0
-; CHECK-NEON-NEXT: lsr r1, r1, #5
+; CHECK-NEON-NEXT: mvn r2, #0
+; CHECK-NEON-NEXT: movmi r2, r0
+; CHECK-NEON-NEXT: movpl r1, r6
+; CHECK-NEON-NEXT: moveq r2, r0
; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: movne r10, r0
-; CHECK-NEON-NEXT: cmp r4, #0
-; CHECK-NEON-NEXT: movpl r4, r7
-; CHECK-NEON-NEXT: cmp r4, #0
-; CHECK-NEON-NEXT: movwgt r6, #1
-; CHECK-NEON-NEXT: cmp r6, #0
+; CHECK-NEON-NEXT: mov r4, #0
; CHECK-NEON-NEXT: mov r0, r8
-; CHECK-NEON-NEXT: movne r6, r10
+; CHECK-NEON-NEXT: movwgt r4, #1
+; CHECK-NEON-NEXT: cmp r4, #0
+; CHECK-NEON-NEXT: movne r4, r2
+; CHECK-NEON-NEXT: cmp r1, #0
+; CHECK-NEON-NEXT: moveq r4, r2
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2lz
; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: mov r2, #0
-; CHECK-NEON-NEXT: movwmi r2, #1
-; CHECK-NEON-NEXT: clz r3, r1
-; CHECK-NEON-NEXT: cmp r2, #0
; CHECK-NEON-NEXT: mvn r2, #0
-; CHECK-NEON-NEXT: movne r2, r0
-; CHECK-NEON-NEXT: lsr r3, r3, #5
-; CHECK-NEON-NEXT: cmp r3, #0
-; CHECK-NEON-NEXT: mov r5, #0
-; CHECK-NEON-NEXT: movne r2, r0
-; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: movpl r1, r7
-; CHECK-NEON-NEXT: clz r0, r1
+; CHECK-NEON-NEXT: movmi r2, r0
+; CHECK-NEON-NEXT: movpl r1, r6
+; CHECK-NEON-NEXT: moveq r2, r0
+; CHECK-NEON-NEXT: vmov r0, s16
; CHECK-NEON-NEXT: cmp r1, #0
+; CHECK-NEON-NEXT: mov r5, #0
; CHECK-NEON-NEXT: movwgt r5, #1
; CHECK-NEON-NEXT: cmp r5, #0
-; CHECK-NEON-NEXT: lsr r0, r0, #5
-; CHECK-NEON-NEXT: movne r5, r2
-; CHECK-NEON-NEXT: cmp r0, #0
-; CHECK-NEON-NEXT: vmov r0, s16
; CHECK-NEON-NEXT: movne r5, r2
+; CHECK-NEON-NEXT: cmp r1, #0
+; CHECK-NEON-NEXT: moveq r5, r2
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2lz
; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: mov r2, #0
-; CHECK-NEON-NEXT: movwmi r2, #1
-; CHECK-NEON-NEXT: cmp r2, #0
-; CHECK-NEON-NEXT: clz r2, r1
-; CHECK-NEON-NEXT: movne r9, r0
-; CHECK-NEON-NEXT: vmov.32 d0[0], r5
-; CHECK-NEON-NEXT: lsr r2, r2, #5
-; CHECK-NEON-NEXT: cmp r2, #0
-; CHECK-NEON-NEXT: movne r9, r0
+; CHECK-NEON-NEXT: vmov.32 d1[0], r5
+; CHECK-NEON-NEXT: movmi r9, r0
+; CHECK-NEON-NEXT: movpl r1, r6
+; CHECK-NEON-NEXT: moveq r9, r0
; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: movpl r1, r7
-; CHECK-NEON-NEXT: clz r0, r1
+; CHECK-NEON-NEXT: movwgt r6, #1
+; CHECK-NEON-NEXT: cmp r6, #0
+; CHECK-NEON-NEXT: vmov.32 d0[0], r4
+; CHECK-NEON-NEXT: movne r6, r9
; CHECK-NEON-NEXT: cmp r1, #0
-; CHECK-NEON-NEXT: movwgt r7, #1
-; CHECK-NEON-NEXT: cmp r7, #0
-; CHECK-NEON-NEXT: movne r7, r9
-; CHECK-NEON-NEXT: lsr r0, r0, #5
-; CHECK-NEON-NEXT: cmp r0, #0
-; CHECK-NEON-NEXT: clz r0, r4
-; CHECK-NEON-NEXT: movne r7, r9
-; CHECK-NEON-NEXT: vmov.32 d1[0], r7
-; CHECK-NEON-NEXT: lsr r0, r0, #5
-; CHECK-NEON-NEXT: cmp r0, #0
-; CHECK-NEON-NEXT: movne r6, r10
-; CHECK-NEON-NEXT: vmov.32 d1[1], r11
+; CHECK-NEON-NEXT: vmov.32 d1[1], r7
+; CHECK-NEON-NEXT: moveq r6, r9
; CHECK-NEON-NEXT: vmov.32 d0[1], r6
; CHECK-NEON-NEXT: vpop {d8, d9, d10}
-; CHECK-NEON-NEXT: add sp, sp, #4
-; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc}
;
; CHECK-FP16-LABEL: ustest_f16i32_mm:
; CHECK-FP16: @ %bb.0: @ %entry
-; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, lr}
; CHECK-FP16-NEXT: .vsave {d8, d9}
; CHECK-FP16-NEXT: vpush {d8, d9}
; CHECK-FP16-NEXT: vmov.u16 r0, d0[3]
; CHECK-FP16-NEXT: vorr d8, d0, d0
-; CHECK-FP16-NEXT: vmov.u16 r4, d0[1]
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: vmov.u16 r2, d8[0]
+; CHECK-FP16-NEXT: vmov.u16 r2, d8[1]
; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: clz r3, r1
+; CHECK-FP16-NEXT: vmov.u16 r7, d8[0]
+; CHECK-FP16-NEXT: mov r5, #0
+; CHECK-FP16-NEXT: vmov.u16 r3, d8[2]
+; CHECK-FP16-NEXT: movpl r1, r5
; CHECK-FP16-NEXT: mov r6, #0
-; CHECK-FP16-NEXT: mov r10, #0
-; CHECK-FP16-NEXT: vmov s0, r4
-; CHECK-FP16-NEXT: lsr r3, r3, #5
; CHECK-FP16-NEXT: mvn r8, #0
-; CHECK-FP16-NEXT: vmov s18, r2
-; CHECK-FP16-NEXT: mov r2, #0
-; CHECK-FP16-NEXT: movwmi r2, #1
-; CHECK-FP16-NEXT: cmp r2, #0
+; CHECK-FP16-NEXT: vmov s16, r2
; CHECK-FP16-NEXT: mvn r2, #0
-; CHECK-FP16-NEXT: movne r2, r0
-; CHECK-FP16-NEXT: cmp r3, #0
-; CHECK-FP16-NEXT: movne r2, r0
+; CHECK-FP16-NEXT: movmi r2, r0
+; CHECK-FP16-NEXT: vmov s0, r7
+; CHECK-FP16-NEXT: moveq r2, r0
; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: movpl r1, r6
-; CHECK-FP16-NEXT: clz r0, r1
+; CHECK-FP16-NEXT: movwgt r6, #1
+; CHECK-FP16-NEXT: cmp r6, #0
+; CHECK-FP16-NEXT: movne r6, r2
; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: movwgt r10, #1
-; CHECK-FP16-NEXT: cmp r10, #0
-; CHECK-FP16-NEXT: movne r10, r2
-; CHECK-FP16-NEXT: lsr r0, r0, #5
-; CHECK-FP16-NEXT: cmp r0, #0
-; CHECK-FP16-NEXT: movne r10, r2
+; CHECK-FP16-NEXT: vmov s18, r3
+; CHECK-FP16-NEXT: moveq r6, r2
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: mov r4, r1
-; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: mov r1, #0
; CHECK-FP16-NEXT: vmov.f32 s0, s18
-; CHECK-FP16-NEXT: movwmi r1, #1
-; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: clz r1, r4
-; CHECK-FP16-NEXT: mvn r9, #0
-; CHECK-FP16-NEXT: movne r9, r0
-; CHECK-FP16-NEXT: mov r5, #0
-; CHECK-FP16-NEXT: lsr r1, r1, #5
; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: movne r9, r0
-; CHECK-FP16-NEXT: cmp r4, #0
-; CHECK-FP16-NEXT: movpl r4, r6
-; CHECK-FP16-NEXT: cmp r4, #0
-; CHECK-FP16-NEXT: movwgt r5, #1
-; CHECK-FP16-NEXT: cmp r5, #0
-; CHECK-FP16-NEXT: movne r5, r9
-; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: mov r2, #0
-; CHECK-FP16-NEXT: movwmi r2, #1
-; CHECK-FP16-NEXT: clz r3, r1
-; CHECK-FP16-NEXT: cmp r2, #0
; CHECK-FP16-NEXT: mvn r2, #0
-; CHECK-FP16-NEXT: movne r2, r0
-; CHECK-FP16-NEXT: lsr r3, r3, #5
-; CHECK-FP16-NEXT: cmp r3, #0
+; CHECK-FP16-NEXT: movpl r1, r5
+; CHECK-FP16-NEXT: movmi r2, r0
; CHECK-FP16-NEXT: mov r7, #0
-; CHECK-FP16-NEXT: movne r2, r0
+; CHECK-FP16-NEXT: moveq r2, r0
; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: movpl r1, r6
-; CHECK-FP16-NEXT: clz r0, r1
-; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: vmov.u16 r1, d8[2]
; CHECK-FP16-NEXT: movwgt r7, #1
; CHECK-FP16-NEXT: cmp r7, #0
; CHECK-FP16-NEXT: movne r7, r2
-; CHECK-FP16-NEXT: lsr r0, r0, #5
-; CHECK-FP16-NEXT: cmp r0, #0
-; CHECK-FP16-NEXT: movne r7, r2
-; CHECK-FP16-NEXT: vmov s0, r1
+; CHECK-FP16-NEXT: cmp r1, #0
+; CHECK-FP16-NEXT: moveq r7, r2
; CHECK-FP16-NEXT: bl __fixhfdi
+; CHECK-FP16-NEXT: vmov.f32 s0, s16
; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: mov r2, #0
-; CHECK-FP16-NEXT: movwmi r2, #1
-; CHECK-FP16-NEXT: cmp r2, #0
-; CHECK-FP16-NEXT: clz r2, r1
-; CHECK-FP16-NEXT: movne r8, r0
-; CHECK-FP16-NEXT: vmov.32 d0[0], r7
-; CHECK-FP16-NEXT: lsr r2, r2, #5
-; CHECK-FP16-NEXT: cmp r2, #0
-; CHECK-FP16-NEXT: movne r8, r0
+; CHECK-FP16-NEXT: mvn r2, #0
+; CHECK-FP16-NEXT: movpl r1, r5
+; CHECK-FP16-NEXT: movmi r2, r0
+; CHECK-FP16-NEXT: mov r4, #0
+; CHECK-FP16-NEXT: moveq r2, r0
; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: movpl r1, r6
-; CHECK-FP16-NEXT: clz r0, r1
+; CHECK-FP16-NEXT: movwgt r4, #1
+; CHECK-FP16-NEXT: cmp r4, #0
+; CHECK-FP16-NEXT: movne r4, r2
; CHECK-FP16-NEXT: cmp r1, #0
-; CHECK-FP16-NEXT: movwgt r6, #1
-; CHECK-FP16-NEXT: cmp r6, #0
-; CHECK-FP16-NEXT: movne r6, r8
-; CHECK-FP16-NEXT: lsr r0, r0, #5
-; CHECK-FP16-NEXT: cmp r0, #0
-; CHECK-FP16-NEXT: clz r0, r4
-; CHECK-FP16-NEXT: movne r6, r8
-; CHECK-FP16-NEXT: vmov.32 d1[0], r6
-; CHECK-FP16-NEXT: lsr r0, r0, #5
-; CHECK-FP16-NEXT: cmp r0, #0
-; CHECK-FP16-NEXT: movne r5, r9
-; CHECK-FP16-NEXT: vmov.32 d1[1], r10
+; CHECK-FP16-NEXT: moveq r4, r2
+; CHECK-FP16-NEXT: bl __fixhfdi
+; CHECK-FP16-NEXT: cmp r1, #0
+; CHECK-FP16-NEXT: vmem
+; CHECK-FP16-NEXT: vmov.32 d1[0], r4
+; CHECK-FP16-NEXT: movmi r8, r0
+; CHECK-FP16-NEXT: movpl r1, r5
+; CHECK-FP16-NEXT: moveq r8, r0
+; CHECK-FP16-NEXT: cmp r1, #0
+; CHECK-FP16-NEXT: movwgt r5, #1
+; CHECK-FP16-NEXT: cmp r5, #0
+; CHECK-FP16-NEXT: vmov.32 d0[0], r7
+; CHECK-FP16-NEXT: movne r5, r8
+; CHECK-FP16-NEXT: cmp r1, #0
+; CHECK-FP16-NEXT: vmov.32 d1[1], r6
+; CHECK-FP16-NEXT: moveq r5, r8
; CHECK-FP16-NEXT: vmov.32 d0[1], r5
; CHECK-FP16-NEXT: vpop {d8, d9}
-; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, pc}
entry:
%conv = fptosi <4 x half> %x to <4 x i64>
%spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> )
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/build-vector-float-type.ll b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-float-type.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-float-type.ll
@@ -0,0 +1,504 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that this code does compile.
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+; CHECK-LABEL: f0:
+; CHECK: vinsert
+define <32 x float> @f0(i32* %a0, float* %a1) #0 {
+b0:
+ %v0 = getelementptr i32, i32* %a0, i32 0
+ %v1 = load i32, i32* %v0, align 4
+ %v2 = getelementptr float, float* %a1, i32 %v1
+ %v3 = load float, float* %v2, align 4
+ %v4 = insertelement <32 x float> undef, float %v3, i32 0
+ %v5 = getelementptr i32, i32* %a0, i32 1
+ %v6 = load i32, i32* %v5, align 4
+ %v7 = getelementptr float, float* %a1, i32 %v6
+ %v8 = load float, float* %v7, align 4
+ %v9 = insertelement <32 x float> %v4, float %v8, i32 1
+ %v10 = getelementptr i32, i32* %a0, i32 2
+ %v11 = load i32, i32* %v10, align 4
+ %v12 = getelementptr float, float* %a1, i32 %v11
+ %v13 = load float, float* %v12, align 4
+ %v14 = insertelement <32 x float> %v9, float %v13, i32 2
+ %v15 = getelementptr i32, i32* %a0, i32 3
+ %v16 = load i32, i32* %v15, align 4
+ %v17 = getelementptr float, float* %a1, i32 %v16
+ %v18 = load float, float* %v17, align 4
+ %v19 = insertelement <32 x float> %v14, float %v18, i32 3
+ %v20 = getelementptr i32, i32* %a0, i32 4
+ %v21 = load i32, i32* %v20, align 4
+ %v22 = getelementptr float, float* %a1, i32 %v21
+ %v23 = load float, float* %v22, align 4
+ %v24 = insertelement <32 x float> %v19, float %v23, i32 4
+ %v25 = getelementptr i32, i32* %a0, i32 5
+ %v26 = load i32, i32* %v25, align 4
+ %v27 = getelementptr float, float* %a1, i32 %v26
+ %v28 = load float, float* %v27, align 4
+ %v29 = insertelement <32 x float> %v24, float %v28, i32 5
+ %v30 = getelementptr i32, i32* %a0, i32 6
+ %v31 = load i32, i32* %v30, align 4
+ %v32 = getelementptr float, float* %a1, i32 %v31
+ %v33 = load float, float* %v32, align 4
+ %v34 = insertelement <32 x float> %v29, float %v33, i32 6
+ %v35 = getelementptr i32, i32* %a0, i32 7
+ %v36 = load i32, i32* %v35, align 4
+ %v37 = getelementptr float, float* %a1, i32 %v36
+ %v38 = load float, float* %v37, align 4
+ %v39 = insertelement <32 x float> %v34, float %v38, i32 7
+ %v40 = getelementptr i32, i32* %a0, i32 8
+ %v41 = load i32, i32* %v40, align 4
+ %v42 = getelementptr float, float* %a1, i32 %v41
+ %v43 = load float, float* %v42, align 4
+ %v44 = insertelement <32 x float> %v39, float %v43, i32 8
+ %v45 = getelementptr i32, i32* %a0, i32 9
+ %v46 = load i32, i32* %v45, align 4
+ %v47 = getelementptr float, float* %a1, i32 %v46
+ %v48 = load float, float* %v47, align 4
+ %v49 = insertelement <32 x float> %v44, float %v48, i32 9
+ %v50 = getelementptr i32, i32* %a0, i32 10
+ %v51 = load i32, i32* %v50, align 4
+ %v52 = getelementptr float, float* %a1, i32 %v51
+ %v53 = load float, float* %v52, align 4
+ %v54 = insertelement <32 x float> %v49, float %v53, i32 10
+ %v55 = getelementptr i32, i32* %a0, i32 11
+ %v56 = load i32, i32* %v55, align 4
+ %v57 = getelementptr float, float* %a1, i32 %v56
+ %v58 = load float, float* %v57, align 4
+ %v59 = insertelement <32 x float> %v54, float %v58, i32 11
+ %v60 = getelementptr i32, i32* %a0, i32 12
+ %v61 = load i32, i32* %v60, align 4
+ %v62 = getelementptr float, float* %a1, i32 %v61
+ %v63 = load float, float* %v62, align 4
+ %v64 = insertelement <32 x float> %v59, float %v63, i32 12
+ %v65 = getelementptr i32, i32* %a0, i32 13
+ %v66 = load i32, i32* %v65, align 4
+ %v67 = getelementptr float, float* %a1, i32 %v66
+ %v68 = load float, float* %v67, align 4
+ %v69 = insertelement <32 x float> %v64, float %v68, i32 13
+ %v70 = getelementptr i32, i32* %a0, i32 14
+ %v71 = load i32, i32* %v70, align 4
+ %v72 = getelementptr float, float* %a1, i32 %v71
+ %v73 = load float, float* %v72, align 4
+ %v74 = insertelement <32 x float> %v69, float %v73, i32 14
+ %v75 = getelementptr i32, i32* %a0, i32 15
+ %v76 = load i32, i32* %v75, align 4
+ %v77 = getelementptr float, float* %a1, i32 %v76
+ %v78 = load float, float* %v77, align 4
+ %v79 = insertelement <32 x float> %v74, float %v78, i32 15
+ %v80 = getelementptr i32, i32* %a0, i32 16
+ %v81 = load i32, i32* %v80, align 4
+ %v82 = getelementptr float, float* %a1, i32 %v81
+ %v83 = load float, float* %v82, align 4
+ %v84 = insertelement <32 x float> %v79, float %v83, i32 16
+ %v85 = getelementptr i32, i32* %a0, i32 17
+ %v86 = load i32, i32* %v85, align 4
+ %v87 = getelementptr float, float* %a1, i32 %v86
+ %v88 = load float, float* %v87, align 4
+ %v89 = insertelement <32 x float> %v84, float %v88, i32 17
+ %v90 = getelementptr i32, i32* %a0, i32 18
+ %v91 = load i32, i32* %v90, align 4
+ %v92 = getelementptr float, float* %a1, i32 %v91
+ %v93 = load float, float* %v92, align 4
+ %v94 = insertelement <32 x float> %v89, float %v93, i32 18
+ %v95 = getelementptr i32, i32* %a0, i32 19
+ %v96 = load i32, i32* %v95, align 4
+ %v97 = getelementptr float, float* %a1, i32 %v96
+ %v98 = load float, float* %v97, align 4
+ %v99 = insertelement <32 x float> %v94, float %v98, i32 19
+ %v100 = getelementptr i32, i32* %a0, i32 20
+ %v101 = load i32, i32* %v100, align 4
+ %v102 = getelementptr float, float* %a1, i32 %v101
+ %v103 = load float, float* %v102, align 4
+ %v104 = insertelement <32 x float> %v99, float %v103, i32 20
+ %v105 = getelementptr i32, i32* %a0, i32 21
+ %v106 = load i32, i32* %v105, align 4
+ %v107 = getelementptr float, float* %a1, i32 %v106
+ %v108 = load float, float* %v107, align 4
+ %v109 = insertelement <32 x float> %v104, float %v108, i32 21
+ %v110 = getelementptr i32, i32* %a0, i32 22
+ %v111 = load i32, i32* %v110, align 4
+ %v112 = getelementptr float, float* %a1, i32 %v111
+ %v113 = load float, float* %v112, align 4
+ %v114 = insertelement <32 x float> %v109, float %v113, i32 22
+ %v115 = getelementptr i32, i32* %a0, i32 23
+ %v116 = load i32, i32* %v115, align 4
+ %v117 = getelementptr float, float* %a1, i32 %v116
+ %v118 = load float, float* %v117, align 4
+ %v119 = insertelement <32 x float> %v114, float %v118, i32 23
+ %v120 = getelementptr i32, i32* %a0, i32 24
+ %v121 = load i32, i32* %v120, align 4
+ %v122 = getelementptr float, float* %a1, i32 %v121
+ %v123 = load float, float* %v122, align 4
+ %v124 = insertelement <32 x float> %v119, float %v123, i32 24
+ %v125 = getelementptr i32, i32* %a0, i32 25
+ %v126 = load i32, i32* %v125, align 4
+ %v127 = getelementptr float, float* %a1, i32 %v126
+ %v128 = load float, float* %v127, align 4
+ %v129 = insertelement <32 x float> %v124, float %v128, i32 25
+ %v130 = getelementptr i32, i32* %a0, i32 26
+ %v131 = load i32, i32* %v130, align 4
+ %v132 = getelementptr float, float* %a1, i32 %v131
+ %v133 = load float, float* %v132, align 4
+ %v134 = insertelement <32 x float> %v129, float %v133, i32 26
+ %v135 = getelementptr i32, i32* %a0, i32 27
+ %v136 = load i32, i32* %v135, align 4
+ %v137 = getelementptr float, float* %a1, i32 %v136
+ %v138 = load float, float* %v137, align 4
+ %v139 = insertelement <32 x float> %v134, float %v138, i32 27
+ %v140 = getelementptr i32, i32* %a0, i32 28
+ %v141 = load i32, i32* %v140, align 4
+ %v142 = getelementptr float, float* %a1, i32 %v141
+ %v143 = load float, float* %v142, align 4
+ %v144 = insertelement <32 x float> %v139, float %v143, i32 28
+ %v145 = getelementptr i32, i32* %a0, i32 29
+ %v146 = load i32, i32* %v145, align 4
+ %v147 = getelementptr float, float* %a1, i32 %v146
+ %v148 = load float, float* %v147, align 4
+ %v149 = insertelement <32 x float> %v144, float %v148, i32 29
+ %v150 = getelementptr i32, i32* %a0, i32 30
+ %v151 = load i32, i32* %v150, align 4
+ %v152 = getelementptr float, float* %a1, i32 %v151
+ %v153 = load float, float* %v152, align 4
+ %v154 = insertelement <32 x float> %v149, float %v153, i32 30
+ %v155 = getelementptr i32, i32* %a0, i32 31
+ %v156 = load i32, i32* %v155, align 4
+ %v157 = getelementptr float, float* %a1, i32 %v156
+ %v158 = load float, float* %v157, align 4
+ %v159 = insertelement <32 x float> %v154, float %v158, i32 31
+ ret <32 x float> %v159
+}
+
+; Function Attrs: nounwind
+; CHECK-LABEL: f1:
+; CHECK: vinsert
+define <64 x half> @f1(i32* %a0, half* %a1) #0 {
+b0:
+ %v0 = getelementptr i32, i32* %a0, i32 0
+ %v1 = load i32, i32* %v0, align 4
+ %v2 = getelementptr half, half* %a1, i32 %v1
+ %v3 = load half, half* %v2, align 4
+ %v4 = insertelement <64 x half> undef, half %v3, i32 0
+ %v5 = getelementptr i32, i32* %a0, i32 1
+ %v6 = load i32, i32* %v5, align 4
+ %v7 = getelementptr half, half* %a1, i32 %v6
+ %v8 = load half, half* %v7, align 4
+ %v9 = insertelement <64 x half> %v4, half %v8, i32 1
+ %v10 = getelementptr i32, i32* %a0, i32 2
+ %v11 = load i32, i32* %v10, align 4
+ %v12 = getelementptr half, half* %a1, i32 %v11
+ %v13 = load half, half* %v12, align 4
+ %v14 = insertelement <64 x half> %v9, half %v13, i32 2
+ %v15 = getelementptr i32, i32* %a0, i32 3
+ %v16 = load i32, i32* %v15, align 4
+ %v17 = getelementptr half, half* %a1, i32 %v16
+ %v18 = load half, half* %v17, align 4
+ %v19 = insertelement <64 x half> %v14, half %v18, i32 3
+ %v20 = getelementptr i32, i32* %a0, i32 4
+ %v21 = load i32, i32* %v20, align 4
+ %v22 = getelementptr half, half* %a1, i32 %v21
+ %v23 = load half, half* %v22, align 4
+ %v24 = insertelement <64 x half> %v19, half %v23, i32 4
+ %v25 = getelementptr i32, i32* %a0, i32 5
+ %v26 = load i32, i32* %v25, align 4
+ %v27 = getelementptr half, half* %a1, i32 %v26
+ %v28 = load half, half* %v27, align 4
+ %v29 = insertelement <64 x half> %v24, half %v28, i32 5
+ %v30 = getelementptr i32, i32* %a0, i32 6
+ %v31 = load i32, i32* %v30, align 4
+ %v32 = getelementptr half, half* %a1, i32 %v31
+ %v33 = load half, half* %v32, align 4
+ %v34 = insertelement <64 x half> %v29, half %v33, i32 6
+ %v35 = getelementptr i32, i32* %a0, i32 7
+ %v36 = load i32, i32* %v35, align 4
+ %v37 = getelementptr half, half* %a1, i32 %v36
+ %v38 = load half, half* %v37, align 4
+ %v39 = insertelement <64 x half> %v34, half %v38, i32 7
+ %v40 = getelementptr i32, i32* %a0, i32 8
+ %v41 = load i32, i32* %v40, align 4
+ %v42 = getelementptr half, half* %a1, i32 %v41
+ %v43 = load half, half* %v42, align 4
+ %v44 = insertelement <64 x half> %v39, half %v43, i32 8
+ %v45 = getelementptr i32, i32* %a0, i32 9
+ %v46 = load i32, i32* %v45, align 4
+ %v47 = getelementptr half, half* %a1, i32 %v46
+ %v48 = load half, half* %v47, align 4
+ %v49 = insertelement <64 x half> %v44, half %v48, i32 9
+ %v50 = getelementptr i32, i32* %a0, i32 10
+ %v51 = load i32, i32* %v50, align 4
+ %v52 = getelementptr half, half* %a1, i32 %v51
+ %v53 = load half, half* %v52, align 4
+ %v54 = insertelement <64 x half> %v49, half %v53, i32 10
+ %v55 = getelementptr i32, i32* %a0, i32 11
+ %v56 = load i32, i32* %v55, align 4
+ %v57 = getelementptr half, half* %a1, i32 %v56
+ %v58 = load half, half* %v57, align 4
+ %v59 = insertelement <64 x half> %v54, half %v58, i32 11
+ %v60 = getelementptr i32, i32* %a0, i32 12
+ %v61 = load i32, i32* %v60, align 4
+ %v62 = getelementptr half, half* %a1, i32 %v61
+ %v63 = load half, half* %v62, align 4
+ %v64 = insertelement <64 x half> %v59, half %v63, i32 12
+ %v65 = getelementptr i32, i32* %a0, i32 13
+ %v66 = load i32, i32* %v65, align 4
+ %v67 = getelementptr half, half* %a1, i32 %v66
+ %v68 = load half, half* %v67, align 4
+ %v69 = insertelement <64 x half> %v64, half %v68, i32 13
+ %v70 = getelementptr i32, i32* %a0, i32 14
+ %v71 = load i32, i32* %v70, align 4
+ %v72 = getelementptr half, half* %a1, i32 %v71
+ %v73 = load half, half* %v72, align 4
+ %v74 = insertelement <64 x half> %v69, half %v73, i32 14
+ %v75 = getelementptr i32, i32* %a0, i32 15
+ %v76 = load i32, i32* %v75, align 4
+ %v77 = getelementptr half, half* %a1, i32 %v76
+ %v78 = load half, half* %v77, align 4
+ %v79 = insertelement <64 x half> %v74, half %v78, i32 15
+ %v80 = getelementptr i32, i32* %a0, i32 16
+ %v81 = load i32, i32* %v80, align 4
+ %v82 = getelementptr half, half* %a1, i32 %v81
+ %v83 = load half, half* %v82, align 4
+ %v84 = insertelement <64 x half> %v79, half %v83, i32 16
+ %v85 = getelementptr i32, i32* %a0, i32 17
+ %v86 = load i32, i32* %v85, align 4
+ %v87 = getelementptr half, half* %a1, i32 %v86
+ %v88 = load half, half* %v87, align 4
+ %v89 = insertelement <64 x half> %v84, half %v88, i32 17
+ %v90 = getelementptr i32, i32* %a0, i32 18
+ %v91 = load i32, i32* %v90, align 4
+ %v92 = getelementptr half, half* %a1, i32 %v91
+ %v93 = load half, half* %v92, align 4
+ %v94 = insertelement <64 x half> %v89, half %v93, i32 18
+ %v95 = getelementptr i32, i32* %a0, i32 19
+ %v96 = load i32, i32* %v95, align 4
+ %v97 = getelementptr half, half* %a1, i32 %v96
+ %v98 = load half, half* %v97, align 4
+ %v99 = insertelement <64 x half> %v94, half %v98, i32 19
+ %v100 = getelementptr i32, i32* %a0, i32 20
+ %v101 = load i32, i32* %v100, align 4
+ %v102 = getelementptr half, half* %a1, i32 %v101
+ %v103 = load half, half* %v102, align 4
+ %v104 = insertelement <64 x half> %v99, half %v103, i32 20
+ %v105 = getelementptr i32, i32* %a0, i32 21
+ %v106 = load i32, i32* %v105, align 4
+ %v107 = getelementptr half, half* %a1, i32 %v106
+ %v108 = load half, half* %v107, align 4
+ %v109 = insertelement <64 x half> %v104, half %v108, i32 21
+ %v110 = getelementptr i32, i32* %a0, i32 22
+ %v111 = load i32, i32* %v110, align 4
+ %v112 = getelementptr half, half* %a1, i32 %v111
+ %v113 = load half, half* %v112, align 4
+ %v114 = insertelement <64 x half> %v109, half %v113, i32 22
+ %v115 = getelementptr i32, i32* %a0, i32 23
+ %v116 = load i32, i32* %v115, align 4
+ %v117 = getelementptr half, half* %a1, i32 %v116
+ %v118 = load half, half* %v117, align 4
+ %v119 = insertelement <64 x half> %v114, half %v118, i32 23
+ %v120 = getelementptr i32, i32* %a0, i32 24
+ %v121 = load i32, i32* %v120, align 4
+ %v122 = getelementptr half, half* %a1, i32 %v121
+ %v123 = load half, half* %v122, align 4
+ %v124 = insertelement <64 x half> %v119, half %v123, i32 24
+ %v125 = getelementptr i32, i32* %a0, i32 25
+ %v126 = load i32, i32* %v125, align 4
+ %v127 = getelementptr half, half* %a1, i32 %v126
+ %v128 = load half, half* %v127, align 4
+ %v129 = insertelement <64 x half> %v124, half %v128, i32 25
+ %v130 = getelementptr i32, i32* %a0, i32 26
+ %v131 = load i32, i32* %v130, align 4
+ %v132 = getelementptr half, half* %a1, i32 %v131
+ %v133 = load half, half* %v132, align 4
+ %v134 = insertelement <64 x half> %v129, half %v133, i32 26
+ %v135 = getelementptr i32, i32* %a0, i32 27
+ %v136 = load i32, i32* %v135, align 4
+ %v137 = getelementptr half, half* %a1, i32 %v136
+ %v138 = load half, half* %v137, align 4
+ %v139 = insertelement <64 x half> %v134, half %v138, i32 27
+ %v140 = getelementptr i32, i32* %a0, i32 28
+ %v141 = load i32, i32* %v140, align 4
+ %v142 = getelementptr half, half* %a1, i32 %v141
+ %v143 = load half, half* %v142, align 4
+ %v144 = insertelement <64 x half> %v139, half %v143, i32 28
+ %v145 = getelementptr i32, i32* %a0, i32 29
+ %v146 = load i32, i32* %v145, align 4
+ %v147 = getelementptr half, half* %a1, i32 %v146
+ %v148 = load half, half* %v147, align 4
+ %v149 = insertelement <64 x half> %v144, half %v148, i32 29
+ %v150 = getelementptr i32, i32* %a0, i32 30
+ %v151 = load i32, i32* %v150, align 4
+ %v152 = getelementptr half, half* %a1, i32 %v151
+ %v153 = load half, half* %v152, align 4
+ %v154 = insertelement <64 x half> %v149, half %v153, i32 30
+ %v155 = getelementptr i32, i32* %a0, i32 31
+ %v156 = load i32, i32* %v155, align 4
+ %v157 = getelementptr half, half* %a1, i32 %v156
+ %v158 = load half, half* %v157, align 4
+ %v159 = insertelement <64 x half> %v154, half %v158, i32 31
+ %v160 = getelementptr i32, i32* %a0, i32 32
+ %v161 = load i32, i32* %v160, align 4
+ %v162 = getelementptr half, half* %a1, i32 %v161
+ %v163 = load half, half* %v162, align 4
+ %v164 = insertelement <64 x half> %v159, half %v163, i32 32
+ %v165 = getelementptr i32, i32* %a0, i32 33
+ %v166 = load i32, i32* %v165, align 4
+ %v167 = getelementptr half, half* %a1, i32 %v166
+ %v168 = load half, half* %v167, align 4
+ %v169 = insertelement <64 x half> %v164, half %v168, i32 33
+ %v170 = getelementptr i32, i32* %a0, i32 34
+ %v171 = load i32, i32* %v170, align 4
+ %v172 = getelementptr half, half* %a1, i32 %v171
+ %v173 = load half, half* %v172, align 4
+ %v174 = insertelement <64 x half> %v169, half %v173, i32 34
+ %v175 = getelementptr i32, i32* %a0, i32 35
+ %v176 = load i32, i32* %v175, align 4
+ %v177 = getelementptr half, half* %a1, i32 %v176
+ %v178 = load half, half* %v177, align 4
+ %v179 = insertelement <64 x half> %v174, half %v178, i32 35
+ %v180 = getelementptr i32, i32* %a0, i32 36
+ %v181 = load i32, i32* %v180, align 4
+ %v182 = getelementptr half, half* %a1, i32 %v181
+ %v183 = load half, half* %v182, align 4
+ %v184 = insertelement <64 x half> %v179, half %v183, i32 36
+ %v185 = getelementptr i32, i32* %a0, i32 37
+ %v186 = load i32, i32* %v185, align 4
+ %v187 = getelementptr half, half* %a1, i32 %v186
+ %v188 = load half, half* %v187, align 4
+ %v189 = insertelement <64 x half> %v184, half %v188, i32 37
+ %v190 = getelementptr i32, i32* %a0, i32 38
+ %v191 = load i32, i32* %v190, align 4
+ %v192 = getelementptr half, half* %a1, i32 %v191
+ %v193 = load half, half* %v192, align 4
+ %v194 = insertelement <64 x half> %v189, half %v193, i32 38
+ %v195 = getelementptr i32, i32* %a0, i32 39
+ %v196 = load i32, i32* %v195, align 4
+ %v197 = getelementptr half, half* %a1, i32 %v196
+ %v198 = load half, half* %v197, align 4
+ %v199 = insertelement <64 x half> %v194, half %v198, i32 39
+ %v200 = getelementptr i32, i32* %a0, i32 40
+ %v201 = load i32, i32* %v200, align 4
+ %v202 = getelementptr half, half* %a1, i32 %v201
+ %v203 = load half, half* %v202, align 4
+ %v204 = insertelement <64 x half> %v199, half %v203, i32 40
+ %v205 = getelementptr i32, i32* %a0, i32 41
+ %v206 = load i32, i32* %v205, align 4
+ %v207 = getelementptr half, half* %a1, i32 %v206
+ %v208 = load half, half* %v207, align 4
+ %v209 = insertelement <64 x half> %v204, half %v208, i32 41
+ %v210 = getelementptr i32, i32* %a0, i32 42
+ %v211 = load i32, i32* %v210, align 4
+ %v212 = getelementptr half, half* %a1, i32 %v211
+ %v213 = load half, half* %v212, align 4
+ %v214 = insertelement <64 x half> %v209, half %v213, i32 42
+ %v215 = getelementptr i32, i32* %a0, i32 43
+ %v216 = load i32, i32* %v215, align 4
+ %v217 = getelementptr half, half* %a1, i32 %v216
+ %v218 = load half, half* %v217, align 4
+ %v219 = insertelement <64 x half> %v214, half %v218, i32 43
+ %v220 = getelementptr i32, i32* %a0, i32 44
+ %v221 = load i32, i32* %v220, align 4
+ %v222 = getelementptr half, half* %a1, i32 %v221
+ %v223 = load half, half* %v222, align 4
+ %v224 = insertelement <64 x half> %v219, half %v223, i32 44
+ %v225 = getelementptr i32, i32* %a0, i32 45
+ %v226 = load i32, i32* %v225, align 4
+ %v227 = getelementptr half, half* %a1, i32 %v226
+ %v228 = load half, half* %v227, align 4
+ %v229 = insertelement <64 x half> %v224, half %v228, i32 45
+ %v230 = getelementptr i32, i32* %a0, i32 46
+ %v231 = load i32, i32* %v230, align 4
+ %v232 = getelementptr half, half* %a1, i32 %v231
+ %v233 = load half, half* %v232, align 4
+ %v234 = insertelement <64 x half> %v229, half %v233, i32 46
+ %v235 = getelementptr i32, i32* %a0, i32 47
+ %v236 = load i32, i32* %v235, align 4
+ %v237 = getelementptr half, half* %a1, i32 %v236
+ %v238 = load half, half* %v237, align 4
+ %v239 = insertelement <64 x half> %v234, half %v238, i32 47
+ %v240 = getelementptr i32, i32* %a0, i32 48
+ %v241 = load i32, i32* %v240, align 4
+ %v242 = getelementptr half, half* %a1, i32 %v241
+ %v243 = load half, half* %v242, align 4
+ %v244 = insertelement <64 x half> %v239, half %v243, i32 48
+ %v245 = getelementptr i32, i32* %a0, i32 49
+ %v246 = load i32, i32* %v245, align 4
+ %v247 = getelementptr half, half* %a1, i32 %v246
+ %v248 = load half, half* %v247, align 4
+ %v249 = insertelement <64 x half> %v244, half %v248, i32 49
+ %v250 = getelementptr i32, i32* %a0, i32 50
+ %v251 = load i32, i32* %v250, align 4
+ %v252 = getelementptr half, half* %a1, i32 %v251
+ %v253 = load half, half* %v252, align 4
+ %v254 = insertelement <64 x half> %v249, half %v253, i32 50
+ %v255 = getelementptr i32, i32* %a0, i32 51
+ %v256 = load i32, i32* %v255, align 4
+ %v257 = getelementptr half, half* %a1, i32 %v256
+ %v258 = load half, half* %v257, align 4
+ %v259 = insertelement <64 x half> %v254, half %v258, i32 51
+ %v260 = getelementptr i32, i32* %a0, i32 52
+ %v261 = load i32, i32* %v260, align 4
+ %v262 = getelementptr half, half* %a1, i32 %v261
+ %v263 = load half, half* %v262, align 4
+ %v264 = insertelement <64 x half> %v259, half %v263, i32 52
+ %v265 = getelementptr i32, i32* %a0, i32 53
+ %v266 = load i32, i32* %v265, align 4
+ %v267 = getelementptr half, half* %a1, i32 %v266
+ %v268 = load half, half* %v267, align 4
+ %v269 = insertelement <64 x half> %v264, half %v268, i32 53
+ %v270 = getelementptr i32, i32* %a0, i32 54
+ %v271 = load i32, i32* %v270, align 4
+ %v272 = getelementptr half, half* %a1, i32 %v271
+ %v273 = load half, half* %v272, align 4
+ %v274 = insertelement <64 x half> %v269, half %v273, i32 54
+ %v275 = getelementptr i32, i32* %a0, i32 55
+ %v276 = load i32, i32* %v275, align 4
+ %v277 = getelementptr half, half* %a1, i32 %v276
+ %v278 = load half, half* %v277, align 4
+ %v279 = insertelement <64 x half> %v274, half %v278, i32 55
+ %v280 = getelementptr i32, i32* %a0, i32 56
+ %v281 = load i32, i32* %v280, align 4
+ %v282 = getelementptr half, half* %a1, i32 %v281
+ %v283 = load half, half* %v282, align 4
+ %v284 = insertelement <64 x half> %v279, half %v283, i32 56
+ %v285 = getelementptr i32, i32* %a0, i32 57
+ %v286 = load i32, i32* %v285, align 4
+ %v287 = getelementptr half, half* %a1, i32 %v286
+ %v288 = load half, half* %v287, align 4
+ %v289 = insertelement <64 x half> %v284, half %v288, i32 57
+ %v290 = getelementptr i32, i32* %a0, i32 58
+ %v291 = load i32, i32* %v290, align 4
+ %v292 = getelementptr half, half* %a1, i32 %v291
+ %v293 = load half, half* %v292, align 4
+ %v294 = insertelement <64 x half> %v289, half %v293, i32 58
+ %v295 = getelementptr i32, i32* %a0, i32 59
+ %v296 = load i32, i32* %v295, align 4
+ %v297 = getelementptr half, half* %a1, i32 %v296
+ %v298 = load half, half* %v297, align 4
+ %v299 = insertelement <64 x half> %v294, half %v298, i32 59
+ %v300 = getelementptr i32, i32* %a0, i32 60
+ %v301 = load i32, i32* %v300, align 4
+ %v302 = getelementptr half, half* %a1, i32 %v301
+ %v303 = load half, half* %v302, align 4
+ %v304 = insertelement <64 x half> %v299, half %v303, i32 60
+ %v305 = getelementptr i32, i32* %a0, i32 61
+ %v306 = load i32, i32* %v305, align 4
+ %v307 = getelementptr half, half* %a1, i32 %v306
+ %v308 = load half, half* %v307, align 4
+ %v309 = insertelement <64 x half> %v304, half %v308, i32 61
+ %v310 = getelementptr i32, i32* %a0, i32 62
+ %v311 = load i32, i32* %v310, align 4
+ %v312 = getelementptr half, half* %a1, i32 %v311
+ %v313 = load half, half* %v312, align 4
+ %v314 = insertelement <64 x half> %v309, half %v313, i32 62
+ %v315 = getelementptr i32, i32* %a0, i32 63
+ %v316 = load i32, i32* %v315, align 4
+ %v317 = getelementptr half, half* %a1, i32 %v316
+ %v318 = load half, half* %v317, align 4
+ %v319 = insertelement <64 x half> %v314, half %v318, i32 63
+ ret <64 x half> %v319
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv69" "target-features"="+hvxv69,+hvx-length128b,+hvx-qfloat" }
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-128b.ll b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-128b.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-128b.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that vector is produced with vxor
+; CHECK: v{{[0-9]*}} = vxor
+define <32 x i32> @f0(i32 %x) #0 {
+ %vect = insertelement <32 x i32> , i32 %x, i32 0
+ ret <32 x i32> %vect
+}
+
+; Check that vector is produced with vsplat
+; CHECK: v{{[0-9]*}} = vsplat
+define <32 x i32> @f1(i32 %x) #0 {
+ %vect = insertelement <32 x i32> , i32 %x, i32 0
+ ret <32 x i32> %vect
+}
+
+; Check that the correct vror is generated
+; CHECK: [[REG0:r([0-9]+)]] = #120
+; CHECK: vror(v{{[0-9]+}},[[REG0]])
+define <32 x i32> @f2(i32 %x) #0 {
+ %vect = insertelement <32 x i32> , i32 %x, i32 2
+ ret <32 x i32> %vect
+}
+
+; Check that the correct vror is generated
+; CHECK: [[REG0:r([0-9]+)]] = #12
+; CHECK: vror(v{{[0-9]+}},[[REG0]])
+define <32 x i32> @f3(i32 %x) #0 {
+ %vect = insertelement <32 x i32> , i32 %x, i32 29
+ ret <32 x i32> %vect
+}
+
+attributes #0 = { readnone nounwind "target-cpu"="hexagonv62" "target-features"="+hvx,+hvx-length128b" }
+
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-64b.ll b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-64b.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-64b.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that vector is produced with vxor
+; CHECK: v{{[0-9]*}} = vxor
+define <16 x i32> @f0(i32 %x) #0 {
+ %vect = insertelement <16 x i32> , i32 %x, i32 0
+ ret <16 x i32> %vect
+}
+
+; Check that vector is produced with vsplat
+; CHECK: v{{[0-9]*}} = vsplat
+define <16 x i32> @f1(i32 %x) #0 {
+ %vect = insertelement <16 x i32> , i32 %x, i32 0
+ ret <16 x i32> %vect
+}
+
+; Check that the correct vror is generated
+; CHECK: [[REG0:r([0-9]+)]] = #56
+; CHECK: vror(v{{[0-9]+}},[[REG0]])
+define <16 x i32> @f2(i32 %x) #0 {
+ %vect = insertelement <16 x i32> , i32 %x, i32 2
+ ret <16 x i32> %vect
+}
+
+; Check that the correct vror is generated
+; CHECK: [[REG0:r([0-9]+)]] = #12
+; CHECK: vror(v{{[0-9]+}},[[REG0]])
+define <16 x i32> @f3(i32 %x) #0 {
+ %vect = insertelement <16 x i32> , i32 %x, i32 13
+ ret <16 x i32> %vect
+}
+
+attributes #0 = { readnone nounwind "target-cpu"="hexagonv62" "target-features"="+hvx,+hvx-length64b" }
+
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/calling-conv.ll b/llvm/test/CodeGen/Hexagon/autohvx/calling-conv.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/calling-conv.ll
@@ -0,0 +1,1528 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+define void @f0(<128 x i8> %a0, <128 x i8>* %a1) #0 {
+; CHECK-LABEL: f0:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a1, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ ret void
+}
+
+define void @f1(<128 x i8> %a0, <128 x i8> %a1, <128 x i8>* %a2) #0 {
+; CHECK-LABEL: f1:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a2, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a2, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ ret void
+}
+
+define void @f2(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8>* %a3) #0 {
+; CHECK-LABEL: f2:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a3, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a3, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a3, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ ret void
+}
+
+define void @f3(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8>* %a4) #0 {
+; CHECK-LABEL: f3:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a4, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a4, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a4, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a4, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ ret void
+}
+
+define void @f4(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8>* %a5) #0 {
+; CHECK-LABEL: f4:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a5, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a5, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a5, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a5, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a5, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ ret void
+}
+
+define void @f5(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8>* %a6) #0 {
+; CHECK-LABEL: f5:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a6, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a6, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a6, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a6, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a6, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a6, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ ret void
+}
+
+define void @f6(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8>* %a7) #0 {
+; CHECK-LABEL: f6:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a7, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a7, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a7, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a7, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a7, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a7, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a7, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ ret void
+}
+
+define void @f7(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8>* %a8) #0 {
+; CHECK-LABEL: f7:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a8, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a8, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a8, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a8, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a8, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a8, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a8, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a8, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ ret void
+}
+
+define void @f8(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8> %a8, <128 x i8>* %a9) #0 {
+; CHECK-LABEL: f8:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = add(r0,#1024)
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v8
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a9, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a9, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a9, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a9, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a9, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a9, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a9, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a9, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ %v8 = getelementptr <128 x i8>, <128 x i8>* %a9, i32 8
+ store <128 x i8> %a8, <128 x i8>* %v8, align 128
+ ret void
+}
+
+define void @f9(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8> %a8, <128 x i8> %a9, <128 x i8>* %a10) #0 {
+; CHECK-LABEL: f9:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = add(r0,#1024)
+; CHECK-NEXT: r2 = add(r0,#1152)
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r1+#0) = v8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r2+#0) = v9
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ %v8 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 8
+ store <128 x i8> %a8, <128 x i8>* %v8, align 128
+ %v9 = getelementptr <128 x i8>, <128 x i8>* %a10, i32 9
+ store <128 x i8> %a9, <128 x i8>* %v9, align 128
+ ret void
+}
+
+define void @f10(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8> %a8, <128 x i8> %a9, <128 x i8> %a10, <128 x i8>* %a11) #0 {
+; CHECK-LABEL: f10:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = add(r0,#1024)
+; CHECK-NEXT: r2 = add(r0,#1152)
+; CHECK-NEXT: r3 = add(r0,#1280)
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r1+#0) = v8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r2+#0) = v9
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r3+#0) = v10
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ %v8 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 8
+ store <128 x i8> %a8, <128 x i8>* %v8, align 128
+ %v9 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 9
+ store <128 x i8> %a9, <128 x i8>* %v9, align 128
+ %v10 = getelementptr <128 x i8>, <128 x i8>* %a11, i32 10
+ store <128 x i8> %a10, <128 x i8>* %v10, align 128
+ ret void
+}
+
+define void @f11(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8> %a8, <128 x i8> %a9, <128 x i8> %a10, <128 x i8> %a11, <128 x i8>* %a12) #0 {
+; CHECK-LABEL: f11:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = add(r0,#1024)
+; CHECK-NEXT: r2 = add(r0,#1152)
+; CHECK-NEXT: r3 = add(r0,#1280)
+; CHECK-NEXT: r4 = add(r0,#1408)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r1+#0) = v8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r2+#0) = v9
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r3+#0) = v10
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r4+#0) = v11
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ %v8 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 8
+ store <128 x i8> %a8, <128 x i8>* %v8, align 128
+ %v9 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 9
+ store <128 x i8> %a9, <128 x i8>* %v9, align 128
+ %v10 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 10
+ store <128 x i8> %a10, <128 x i8>* %v10, align 128
+ %v11 = getelementptr <128 x i8>, <128 x i8>* %a12, i32 11
+ store <128 x i8> %a11, <128 x i8>* %v11, align 128
+ ret void
+}
+
+define void @f12(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8> %a8, <128 x i8> %a9, <128 x i8> %a10, <128 x i8> %a11, <128 x i8> %a12, <128 x i8>* %a13) #0 {
+; CHECK-LABEL: f12:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = add(r0,#1024)
+; CHECK-NEXT: r2 = add(r0,#1152)
+; CHECK-NEXT: r3 = add(r0,#1280)
+; CHECK-NEXT: r4 = add(r0,#1408)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 = add(r0,#1536)
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r1+#0) = v8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r2+#0) = v9
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r3+#0) = v10
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r4+#0) = v11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r5+#0) = v12
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ %v8 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 8
+ store <128 x i8> %a8, <128 x i8>* %v8, align 128
+ %v9 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 9
+ store <128 x i8> %a9, <128 x i8>* %v9, align 128
+ %v10 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 10
+ store <128 x i8> %a10, <128 x i8>* %v10, align 128
+ %v11 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 11
+ store <128 x i8> %a11, <128 x i8>* %v11, align 128
+ %v12 = getelementptr <128 x i8>, <128 x i8>* %a13, i32 12
+ store <128 x i8> %a12, <128 x i8>* %v12, align 128
+ ret void
+}
+
+define void @f13(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8> %a8, <128 x i8> %a9, <128 x i8> %a10, <128 x i8> %a11, <128 x i8> %a12, <128 x i8> %a13, <128 x i8>* %a14) #0 {
+; CHECK-LABEL: f13:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = add(r0,#1152)
+; CHECK-NEXT: r3 = add(r0,#1280)
+; CHECK-NEXT: r1 = add(r0,#1024)
+; CHECK-NEXT: r4 = add(r0,#1408)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = add(r0,#1664)
+; CHECK-NEXT: vmem(r2+#0) = v9
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = add(r0,#1536)
+; CHECK-NEXT: vmem(r3+#0) = v10
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r1+#0) = v8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r4+#0) = v11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r3+#0) = v12
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r2+#0) = v13
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ %v8 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 8
+ store <128 x i8> %a8, <128 x i8>* %v8, align 128
+ %v9 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 9
+ store <128 x i8> %a9, <128 x i8>* %v9, align 128
+ %v10 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 10
+ store <128 x i8> %a10, <128 x i8>* %v10, align 128
+ %v11 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 11
+ store <128 x i8> %a11, <128 x i8>* %v11, align 128
+ %v12 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 12
+ store <128 x i8> %a12, <128 x i8>* %v12, align 128
+ %v13 = getelementptr <128 x i8>, <128 x i8>* %a14, i32 13
+ store <128 x i8> %a13, <128 x i8>* %v13, align 128
+ ret void
+}
+
+define void @f14(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8> %a8, <128 x i8> %a9, <128 x i8> %a10, <128 x i8> %a11, <128 x i8> %a12, <128 x i8> %a13, <128 x i8> %a14, <128 x i8>* %a15) #0 {
+; CHECK-LABEL: f14:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = add(r0,#1152)
+; CHECK-NEXT: r3 = add(r0,#1280)
+; CHECK-NEXT: r4 = add(r0,#1408)
+; CHECK-NEXT: r1 = add(r0,#1024)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = add(r0,#1792)
+; CHECK-NEXT: vmem(r2+#0) = v9
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = add(r0,#1664)
+; CHECK-NEXT: vmem(r3+#0) = v10
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = add(r0,#1536)
+; CHECK-NEXT: vmem(r4+#0) = v11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r1+#0) = v8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r4+#0) = v12
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r3+#0) = v13
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r2+#0) = v14
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ %v8 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 8
+ store <128 x i8> %a8, <128 x i8>* %v8, align 128
+ %v9 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 9
+ store <128 x i8> %a9, <128 x i8>* %v9, align 128
+ %v10 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 10
+ store <128 x i8> %a10, <128 x i8>* %v10, align 128
+ %v11 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 11
+ store <128 x i8> %a11, <128 x i8>* %v11, align 128
+ %v12 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 12
+ store <128 x i8> %a12, <128 x i8>* %v12, align 128
+ %v13 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 13
+ store <128 x i8> %a13, <128 x i8>* %v13, align 128
+ %v14 = getelementptr <128 x i8>, <128 x i8>* %a15, i32 14
+ store <128 x i8> %a14, <128 x i8>* %v14, align 128
+ ret void
+}
+
+define void @f15(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8> %a8, <128 x i8> %a9, <128 x i8> %a10, <128 x i8> %a11, <128 x i8> %a12, <128 x i8> %a13, <128 x i8> %a14, <128 x i8> %a15, <128 x i8>* %a16) #0 {
+; CHECK-LABEL: f15:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = add(r0,#1152)
+; CHECK-NEXT: r3 = add(r0,#1280)
+; CHECK-NEXT: r1 = add(r0,#1024)
+; CHECK-NEXT: r6 = add(r0,#1408)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 = add(r0,#1536)
+; CHECK-NEXT: r4 = add(r0,#1664)
+; CHECK-NEXT: r2 = add(r0,#1920)
+; CHECK-NEXT: vmem(r2+#0) = v9
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = add(r0,#1792)
+; CHECK-NEXT: vmem(r3+#0) = v10
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r1+#0) = v8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r6+#0) = v11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r5+#0) = v12
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r4+#0) = v13
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r3+#0) = v14
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r2+#0) = v15
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ %v8 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 8
+ store <128 x i8> %a8, <128 x i8>* %v8, align 128
+ %v9 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 9
+ store <128 x i8> %a9, <128 x i8>* %v9, align 128
+ %v10 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 10
+ store <128 x i8> %a10, <128 x i8>* %v10, align 128
+ %v11 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 11
+ store <128 x i8> %a11, <128 x i8>* %v11, align 128
+ %v12 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 12
+ store <128 x i8> %a12, <128 x i8>* %v12, align 128
+ %v13 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 13
+ store <128 x i8> %a13, <128 x i8>* %v13, align 128
+ %v14 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 14
+ store <128 x i8> %a14, <128 x i8>* %v14, align 128
+ %v15 = getelementptr <128 x i8>, <128 x i8>* %a16, i32 15
+ store <128 x i8> %a15, <128 x i8>* %v15, align 128
+ ret void
+}
+
+define void @f16(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2, <128 x i8> %a3, <128 x i8> %a4, <128 x i8> %a5, <128 x i8> %a6, <128 x i8> %a7, <128 x i8> %a8, <128 x i8> %a9, <128 x i8> %a10, <128 x i8> %a11, <128 x i8> %a12, <128 x i8> %a13, <128 x i8> %a14, <128 x i8> %a15, <128 x i8> %a16, <128 x i8>* %a17) #0 {
+; CHECK-LABEL: f16:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = add(r0,#1152)
+; CHECK-NEXT: r3 = add(r0,#1280)
+; CHECK-NEXT: r4 = add(r0,#1408)
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = add(r30,#8)
+; CHECK-NEXT: r7 = add(r0,#1024)
+; CHECK-NEXT: r6 = add(r0,#1536)
+; CHECK-NEXT: vmem(r2+#0) = v9
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 = add(r0,#1664)
+; CHECK-NEXT: r3 = add(r0,#1920)
+; CHECK-NEXT: r2 = add(r0,#2048)
+; CHECK-NEXT: vmem(r3+#0) = v10
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = add(r0,#1792)
+; CHECK-NEXT: r29 = and(r29,#-128)
+; CHECK-NEXT: v16 = vmem(r1+#0)
+; CHECK-NEXT: vmem(r4+#0) = v11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r7+#0) = v8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r6+#0) = v12
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r5+#0) = v13
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r4+#0) = v14
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r3+#0) = v15
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r2+#0) = v16
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 0
+ store <128 x i8> %a0, <128 x i8>* %v0, align 128
+ %v1 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 1
+ store <128 x i8> %a1, <128 x i8>* %v1, align 128
+ %v2 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 2
+ store <128 x i8> %a2, <128 x i8>* %v2, align 128
+ %v3 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 3
+ store <128 x i8> %a3, <128 x i8>* %v3, align 128
+ %v4 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 4
+ store <128 x i8> %a4, <128 x i8>* %v4, align 128
+ %v5 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 5
+ store <128 x i8> %a5, <128 x i8>* %v5, align 128
+ %v6 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 6
+ store <128 x i8> %a6, <128 x i8>* %v6, align 128
+ %v7 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 7
+ store <128 x i8> %a7, <128 x i8>* %v7, align 128
+ %v8 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 8
+ store <128 x i8> %a8, <128 x i8>* %v8, align 128
+ %v9 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 9
+ store <128 x i8> %a9, <128 x i8>* %v9, align 128
+ %v10 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 10
+ store <128 x i8> %a10, <128 x i8>* %v10, align 128
+ %v11 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 11
+ store <128 x i8> %a11, <128 x i8>* %v11, align 128
+ %v12 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 12
+ store <128 x i8> %a12, <128 x i8>* %v12, align 128
+ %v13 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 13
+ store <128 x i8> %a13, <128 x i8>* %v13, align 128
+ %v14 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 14
+ store <128 x i8> %a14, <128 x i8>* %v14, align 128
+ %v15 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 15
+ store <128 x i8> %a15, <128 x i8>* %v15, align 128
+ %v16 = getelementptr <128 x i8>, <128 x i8>* %a17, i32 16
+ store <128 x i8> %a16, <128 x i8>* %v16, align 128
+ ret void
+}
+
+define void @f17(<64 x i16> %a0, <64 x i16> %a1, <64 x i16> %a2, <64 x i16> %a3, <64 x i16> %a4, <64 x i16> %a5, <64 x i16> %a6, <64 x i16> %a7, <64 x i16> %a8, <64 x i16> %a9, <64 x i16> %a10, <64 x i16> %a11, <64 x i16> %a12, <64 x i16> %a13, <64 x i16> %a14, <64 x i16> %a15, <64 x i16> %a16, <64 x i16>* %a17) #0 {
+; CHECK-LABEL: f17:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = add(r0,#1152)
+; CHECK-NEXT: r3 = add(r0,#1280)
+; CHECK-NEXT: r4 = add(r0,#1408)
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = add(r30,#8)
+; CHECK-NEXT: r7 = add(r0,#1024)
+; CHECK-NEXT: r6 = add(r0,#1536)
+; CHECK-NEXT: vmem(r2+#0) = v9
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 = add(r0,#1664)
+; CHECK-NEXT: r3 = add(r0,#1920)
+; CHECK-NEXT: r2 = add(r0,#2048)
+; CHECK-NEXT: vmem(r3+#0) = v10
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = add(r0,#1792)
+; CHECK-NEXT: r29 = and(r29,#-128)
+; CHECK-NEXT: v16 = vmem(r1+#0)
+; CHECK-NEXT: vmem(r4+#0) = v11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#1) = v1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r7+#0) = v8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#2) = v2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#3) = v3
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#4) = v4
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r6+#0) = v12
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#5) = v5
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r5+#0) = v13
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#6) = v6
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r4+#0) = v14
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r0+#7) = v7
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r3+#0) = v15
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: vmem(r2+#0) = v16
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+b0:
+ %v0 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 0
+ store <64 x i16> %a0, <64 x i16>* %v0, align 128
+ %v1 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 1
+ store <64 x i16> %a1, <64 x i16>* %v1, align 128
+ %v2 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 2
+ store <64 x i16> %a2, <64 x i16>* %v2, align 128
+ %v3 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 3
+ store <64 x i16> %a3, <64 x i16>* %v3, align 128
+ %v4 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 4
+ store <64 x i16> %a4, <64 x i16>* %v4, align 128
+ %v5 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 5
+ store <64 x i16> %a5, <64 x i16>* %v5, align 128
+ %v6 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 6
+ store <64 x i16> %a6, <64 x i16>* %v6, align 128
+ %v7 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 7
+ store <64 x i16> %a7, <64 x i16>* %v7, align 128
+ %v8 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 8
+ store <64 x i16> %a8, <64 x i16>* %v8, align 128
+ %v9 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 9
+ store <64 x i16> %a9, <64 x i16>* %v9, align 128
+ %v10 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 10
+ store <64 x i16> %a10, <64 x i16>* %v10, align 128
+ %v11 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 11
+ store <64 x i16> %a11, <64 x i16>* %v11, align 128
+ %v12 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 12
+ store <64 x i16> %a12, <64 x i16>* %v12, align 128
+ %v13 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 13
+ store <64 x i16> %a13, <64 x i16>* %v13, align 128
+ %v14 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 14
+ store <64 x i16> %a14, <64 x i16>* %v14, align 128
+ %v15 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 15
+ store <64 x i16> %a15, <64 x i16>* %v15, align 128
+ %v16 = getelementptr <64 x i16>, <64 x i16>* %a17, i32 16
+ store <64 x i16> %a16, <64 x i16>* %v16, align 128 + ret void +} + +define void @f18(<32 x i32> %a0, <32 x i32> %a1, <32 x i32> %a2, <32 x i32> %a3, <32 x i32> %a4, <32 x i32> %a5, <32 x i32> %a6, <32 x i32> %a7, <32 x i32> %a8, <32 x i32> %a9, <32 x i32> %a10, <32 x i32> %a11, <32 x i32> %a12, <32 x i32> %a13, <32 x i32> %a14, <32 x i32> %a15, <32 x i32> %a16, <32 x i32>* %a17) #0 { +; CHECK-LABEL: f18: +; CHECK: // %bb.0: // %b0 +; CHECK-NEXT: { +; CHECK-NEXT: r2 = add(r0,#1152) +; CHECK-NEXT: r3 = add(r0,#1280) +; CHECK-NEXT: r4 = add(r0,#1408) +; CHECK-NEXT: allocframe(r29,#0):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = add(r30,#8) +; CHECK-NEXT: r7 = add(r0,#1024) +; CHECK-NEXT: r6 = add(r0,#1536) +; CHECK-NEXT: vmem(r2+#0) = v9 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r5 = add(r0,#1664) +; CHECK-NEXT: r3 = add(r0,#1920) +; CHECK-NEXT: r2 = add(r0,#2048) +; CHECK-NEXT: vmem(r3+#0) = v10 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = add(r0,#1792) +; CHECK-NEXT: r29 = and(r29,#-128) +; CHECK-NEXT: v16 = vmem(r1+#0) +; CHECK-NEXT: vmem(r4+#0) = v11 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#0) = v0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#1) = v1 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r7+#0) = v8 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#2) = v2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#3) = v3 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#4) = v4 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r6+#0) = v12 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#5) = v5 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r5+#0) = v13 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#6) = v6 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r4+#0) = v14 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#7) = v7 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r3+#0) = v15 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r2+#0) = v16 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } +b0: + %v0 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 0 + store <32 x i32> %a0, <32 x i32>* %v0, align 128 + %v1 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 1 + store <32 x i32> %a1, <32 x i32>* %v1, align 128 + %v2 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 2 + store <32 x i32> %a2, <32 x i32>* %v2, align 128 + %v3 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 3 + store <32 x i32> %a3, <32 x i32>* %v3, align 128 + %v4 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 4 + store <32 x i32> %a4, <32 x i32>* %v4, align 128 + %v5 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 5 + store <32 x i32> %a5, <32 x i32>* %v5, align 128 + %v6 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 6 + store <32 x i32> %a6, <32 x i32>* %v6, align 128 + %v7 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 7 + store <32 x i32> %a7, <32 x i32>* %v7, align 128 + %v8 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 8 + store <32 x i32> %a8, <32 x i32>* %v8, align 128 + %v9 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 9 + store <32 x i32> %a9, <32 x i32>* %v9, align 128 + %v10 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 10 + store <32 x i32> %a10, <32 x i32>* %v10, align 128 + %v11 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 11 + store <32 x i32> %a11, <32 x i32>* %v11, align 128 + %v12 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 12 + store <32 x i32> %a12, <32 x i32>* %v12, 
align 128 + %v13 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 13 + store <32 x i32> %a13, <32 x i32>* %v13, align 128 + %v14 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 14 + store <32 x i32> %a14, <32 x i32>* %v14, align 128 + %v15 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 15 + store <32 x i32> %a15, <32 x i32>* %v15, align 128 + %v16 = getelementptr <32 x i32>, <32 x i32>* %a17, i32 16 + store <32 x i32> %a16, <32 x i32>* %v16, align 128 + ret void +} + +define void @f19(<64 x half> %a0, <64 x half> %a1, <64 x half> %a2, <64 x half> %a3, <64 x half> %a4, <64 x half> %a5, <64 x half> %a6, <64 x half> %a7, <64 x half> %a8, <64 x half> %a9, <64 x half> %a10, <64 x half> %a11, <64 x half> %a12, <64 x half> %a13, <64 x half> %a14, <64 x half> %a15, <64 x half> %a16, <64 x half>* %a17) #0 { +; CHECK-LABEL: f19: +; CHECK: // %bb.0: // %b0 +; CHECK-NEXT: { +; CHECK-NEXT: r2 = add(r0,#1152) +; CHECK-NEXT: r3 = add(r0,#1280) +; CHECK-NEXT: r4 = add(r0,#1408) +; CHECK-NEXT: allocframe(r29,#0):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = add(r30,#8) +; CHECK-NEXT: r7 = add(r0,#1024) +; CHECK-NEXT: r6 = add(r0,#1536) +; CHECK-NEXT: vmem(r2+#0) = v9 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r5 = add(r0,#1664) +; CHECK-NEXT: r3 = add(r0,#1920) +; CHECK-NEXT: r2 = add(r0,#2048) +; CHECK-NEXT: vmem(r3+#0) = v10 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = add(r0,#1792) +; CHECK-NEXT: r29 = and(r29,#-128) +; CHECK-NEXT: v16 = vmem(r1+#0) +; CHECK-NEXT: vmem(r4+#0) = v11 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#0) = v0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#1) = v1 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r7+#0) = v8 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#2) = v2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#3) = v3 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#4) = v4 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r6+#0) = v12 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#5) = v5 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r5+#0) = v13 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#6) = v6 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r4+#0) = v14 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#7) = v7 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r3+#0) = v15 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r2+#0) = v16 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } +b0: + %v0 = getelementptr <64 x half>, <64 x half>* %a17, i32 0 + store <64 x half> %a0, <64 x half>* %v0, align 128 + %v1 = getelementptr <64 x half>, <64 x half>* %a17, i32 1 + store <64 x half> %a1, <64 x half>* %v1, align 128 + %v2 = getelementptr <64 x half>, <64 x half>* %a17, i32 2 + store <64 x half> %a2, <64 x half>* %v2, align 128 + %v3 = getelementptr <64 x half>, <64 x half>* %a17, i32 3 + store <64 x half> %a3, <64 x half>* %v3, align 128 + %v4 = getelementptr <64 x half>, <64 x half>* %a17, i32 4 + store <64 x half> %a4, <64 x half>* %v4, align 128 + %v5 = getelementptr <64 x half>, <64 x half>* %a17, i32 5 + store <64 x half> %a5, <64 x half>* %v5, align 128 + %v6 = getelementptr <64 x half>, <64 x half>* %a17, i32 6 + store <64 x half> %a6, <64 x half>* %v6, align 128 + %v7 = getelementptr <64 x half>, <64 x half>* %a17, i32 7 + store <64 x half> %a7, <64 x half>* %v7, align 128 + %v8 = getelementptr <64 x half>, <64 x half>* %a17, i32 8 + store <64 x half> %a8, <64 x 
half>* %v8, align 128 + %v9 = getelementptr <64 x half>, <64 x half>* %a17, i32 9 + store <64 x half> %a9, <64 x half>* %v9, align 128 + %v10 = getelementptr <64 x half>, <64 x half>* %a17, i32 10 + store <64 x half> %a10, <64 x half>* %v10, align 128 + %v11 = getelementptr <64 x half>, <64 x half>* %a17, i32 11 + store <64 x half> %a11, <64 x half>* %v11, align 128 + %v12 = getelementptr <64 x half>, <64 x half>* %a17, i32 12 + store <64 x half> %a12, <64 x half>* %v12, align 128 + %v13 = getelementptr <64 x half>, <64 x half>* %a17, i32 13 + store <64 x half> %a13, <64 x half>* %v13, align 128 + %v14 = getelementptr <64 x half>, <64 x half>* %a17, i32 14 + store <64 x half> %a14, <64 x half>* %v14, align 128 + %v15 = getelementptr <64 x half>, <64 x half>* %a17, i32 15 + store <64 x half> %a15, <64 x half>* %v15, align 128 + %v16 = getelementptr <64 x half>, <64 x half>* %a17, i32 16 + store <64 x half> %a16, <64 x half>* %v16, align 128 + ret void +} + +define void @f20(<32 x float> %a0, <32 x float> %a1, <32 x float> %a2, <32 x float> %a3, <32 x float> %a4, <32 x float> %a5, <32 x float> %a6, <32 x float> %a7, <32 x float> %a8, <32 x float> %a9, <32 x float> %a10, <32 x float> %a11, <32 x float> %a12, <32 x float> %a13, <32 x float> %a14, <32 x float> %a15, <32 x float> %a16, <32 x float>* %a17) #0 { +; CHECK-LABEL: f20: +; CHECK: // %bb.0: // %b0 +; CHECK-NEXT: { +; CHECK-NEXT: r2 = add(r0,#1152) +; CHECK-NEXT: r3 = add(r0,#1280) +; CHECK-NEXT: r4 = add(r0,#1408) +; CHECK-NEXT: allocframe(r29,#0):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = add(r30,#8) +; CHECK-NEXT: r7 = add(r0,#1024) +; CHECK-NEXT: r6 = add(r0,#1536) +; CHECK-NEXT: vmem(r2+#0) = v9 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r5 = add(r0,#1664) +; CHECK-NEXT: r3 = add(r0,#1920) +; CHECK-NEXT: r2 = add(r0,#2048) +; CHECK-NEXT: vmem(r3+#0) = v10 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = add(r0,#1792) +; CHECK-NEXT: r29 = and(r29,#-128) +; CHECK-NEXT: v16 = vmem(r1+#0) +; CHECK-NEXT: vmem(r4+#0) = v11 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#0) = v0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#1) = v1 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r7+#0) = v8 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#2) = v2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#3) = v3 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#4) = v4 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r6+#0) = v12 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#5) = v5 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r5+#0) = v13 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#6) = v6 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r4+#0) = v14 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r0+#7) = v7 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r3+#0) = v15 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: vmem(r2+#0) = v16 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } +b0: + %v0 = getelementptr <32 x float>, <32 x float>* %a17, i32 0 + store <32 x float> %a0, <32 x float>* %v0, align 128 + %v1 = getelementptr <32 x float>, <32 x float>* %a17, i32 1 + store <32 x float> %a1, <32 x float>* %v1, align 128 + %v2 = getelementptr <32 x float>, <32 x float>* %a17, i32 2 + store <32 x float> %a2, <32 x float>* %v2, align 128 + %v3 = getelementptr <32 x float>, <32 x float>* %a17, i32 3 + store <32 x float> %a3, <32 x float>* %v3, align 128 + %v4 = 
getelementptr <32 x float>, <32 x float>* %a17, i32 4 + store <32 x float> %a4, <32 x float>* %v4, align 128 + %v5 = getelementptr <32 x float>, <32 x float>* %a17, i32 5 + store <32 x float> %a5, <32 x float>* %v5, align 128 + %v6 = getelementptr <32 x float>, <32 x float>* %a17, i32 6 + store <32 x float> %a6, <32 x float>* %v6, align 128 + %v7 = getelementptr <32 x float>, <32 x float>* %a17, i32 7 + store <32 x float> %a7, <32 x float>* %v7, align 128 + %v8 = getelementptr <32 x float>, <32 x float>* %a17, i32 8 + store <32 x float> %a8, <32 x float>* %v8, align 128 + %v9 = getelementptr <32 x float>, <32 x float>* %a17, i32 9 + store <32 x float> %a9, <32 x float>* %v9, align 128 + %v10 = getelementptr <32 x float>, <32 x float>* %a17, i32 10 + store <32 x float> %a10, <32 x float>* %v10, align 128 + %v11 = getelementptr <32 x float>, <32 x float>* %a17, i32 11 + store <32 x float> %a11, <32 x float>* %v11, align 128 + %v12 = getelementptr <32 x float>, <32 x float>* %a17, i32 12 + store <32 x float> %a12, <32 x float>* %v12, align 128 + %v13 = getelementptr <32 x float>, <32 x float>* %a17, i32 13 + store <32 x float> %a13, <32 x float>* %v13, align 128 + %v14 = getelementptr <32 x float>, <32 x float>* %a17, i32 14 + store <32 x float> %a14, <32 x float>* %v14, align 128 + %v15 = getelementptr <32 x float>, <32 x float>* %a17, i32 15 + store <32 x float> %a15, <32 x float>* %v15, align 128 + %v16 = getelementptr <32 x float>, <32 x float>* %a17, i32 16 + store <32 x float> %a16, <32 x float>* %v16, align 128 + ret void +} + +define <128 x i8> @f21() #0 { +; CHECK-LABEL: f21: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <128 x i8> undef, i8 0, i32 0 + %v1 = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> zeroinitializer + ret <128 x i8> %v1 +} + +define <256 x i8> @f22() #0 { +; CHECK-LABEL: f22: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v1:0.w = vsub(v1:0.w,v1:0.w) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <256 x i8> undef, i8 0, i32 0 + %v1 = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> zeroinitializer + ret <256 x i8> %v1 +} + +define <64 x i16> @f23() #0 { +; CHECK-LABEL: f23: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <64 x i16> undef, i16 0, i32 0 + %v1 = shufflevector <64 x i16> %v0, <64 x i16> undef, <64 x i32> zeroinitializer + ret <64 x i16> %v1 +} + +define <128 x i16> @f24() #0 { +; CHECK-LABEL: f24: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v1:0.w = vsub(v1:0.w,v1:0.w) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <128 x i16> undef, i16 0, i32 0 + %v1 = shufflevector <128 x i16> %v0, <128 x i16> undef, <128 x i32> zeroinitializer + ret <128 x i16> %v1 +} + +define <32 x i32> @f25() #0 { +; CHECK-LABEL: f25: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <32 x i32> undef, i32 0, i32 0 + %v1 = shufflevector <32 x i32> %v0, <32 x i32> undef, <32 x i32> zeroinitializer + ret <32 x i32> %v1 +} + +define <64 x i32> @f26() #0 { +; CHECK-LABEL: f26: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v1:0.w = vsub(v1:0.w,v1:0.w) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <64 x i32> undef, i32 0, i32 0 + %v1 = shufflevector <64 x i32> %v0, <64 x i32> undef, <64 x i32> zeroinitializer + ret <64 x i32> %v1 +} + 
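f21 through f26 above, and the half/float variants f27 through f30 that follow, all pin down the same lowering. The canonical splat idiom in IR is an insertelement into lane 0 of an undef vector followed by a zero-mask shufflevector that broadcasts that lane. A splat of zero is all-zero bits whatever the element type, including +0.0 for half and float, so the backend can simply zero the destination: vxor(v0,v0) for a single HVX vector, a vector-pair vsub for a pair. A minimal sketch of the idiom, with an invented function name; this is an illustration, not part of the patch:

; Sketch of the canonical splat idiom (illustrative, not from the patch):
; insert the scalar into lane 0, then broadcast lane 0 to every lane.
define <32 x i32> @sketch_splat(i32 %s) {
  %ins = insertelement <32 x i32> undef, i32 %s, i32 0
  %spl = shufflevector <32 x i32> %ins, <32 x i32> undef, <32 x i32> zeroinitializer
  ret <32 x i32> %spl
}

With %s known to be zero, the whole pattern folds to zeroinitializer, which is what lets instruction selection pick a zeroing idiom instead of materializing a register splat.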
+define <64 x half> @f27() #0 { +; CHECK-LABEL: f27: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <64 x half> undef, half 0xH0, i32 0 + %v1 = shufflevector <64 x half> %v0, <64 x half> undef, <64 x i32> zeroinitializer + ret <64 x half> %v1 +} + +define <128 x half> @f28() #0 { +; CHECK-LABEL: f28: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v1:0.w = vsub(v1:0.w,v1:0.w) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <128 x half> undef, half 0xH0, i32 0 + %v1 = shufflevector <128 x half> %v0, <128 x half> undef, <128 x i32> zeroinitializer + ret <128 x half> %v1 +} + +define <32 x float> @f29() #0 { +; CHECK-LABEL: f29: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <32 x float> undef, float 0.0, i32 0 + %v1 = shufflevector <32 x float> %v0, <32 x float> undef, <32 x i32> zeroinitializer + ret <32 x float> %v1 +} + +define <64 x float> @f30() #0 { +; CHECK-LABEL: f30: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v1:0.w = vsub(v1:0.w,v1:0.w) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <64 x float> undef, float 0.0, i32 0 + %v1 = shufflevector <64 x float> %v0, <64 x float> undef, <64 x i32> zeroinitializer + ret <64 x float> %v1 +} + +attributes #0 = { nounwind "target-cpu"="hexagonv69" "target-features"="+hvxv69,+hvx-length128b,+hvx-qfloat" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fsplat.ll b/llvm/test/CodeGen/Hexagon/autohvx/fsplat.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/fsplat.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=hexagon < %s | FileCheck %s + +; Check that the vsplat instruction is generated +; CHECK: r[[V:[0-9]+]] = ##1092616192 +; CHECK: vsplat(r[[V]]) + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" +; Function Attrs: nofree norecurse nounwind writeonly +define dso_local i32 @foo(float* nocapture %0, i32 %1) local_unnamed_addr #0 { + %3 = icmp sgt i32 %1, 0 + br i1 %3, label %4, label %22 + +4: ; preds = %2 + %5 = icmp ult i32 %1, 64 + br i1 %5, label %6, label %9 + +6: ; preds = %20, %4 + %7 = phi float* [ %0, %4 ], [ %11, %20 ] + %8 = phi i32 [ 0, %4 ], [ %10, %20 ] + br label %23 + +9: ; preds = %4 + %10 = and i32 %1, -64 + %11 = getelementptr float, float* %0, i32 %10 + br label %12 + +12: ; preds = %12, %9 + %13 = phi i32 [ 0, %9 ], [ %18, %12 ] + %14 = getelementptr float, float* %0, i32 %13 + %15 = bitcast float* %14 to <32 x float>* + store <32 x float> , <32 x float>* %15, align 4 + %16 = getelementptr float, float* %14, i32 32 + %17 = bitcast float* %16 to <32 x float>* + store <32 x float> , <32 x float>* %17, align 4 + %18 = add i32 %13, 64 + %19 = icmp eq i32 %18, %10 + br i1 %19, label %20, label %12 + +20: ; preds = %12 + %21 = icmp eq i32 %10, %1 + br i1 %21, label %22, label %6 + +22: ; preds = %23, %20, %2 + ret i32 0 + +23: ; preds = %23, %6 + %24 = phi float* [ %28, %23 ], [ %7, %6 ] + %25 = phi i32 [ %26, %23 ], [ %8, %6 ] + store float 1.000000e+01, float* %24, align 4 + %26 = add nuw nsw i32 %25, 1 + %27 = icmp eq i32 %26, %1 + %28 = getelementptr float, float* %24, i32 1 + br i1 %27, label %22, label %23 +} + +attributes #0 = { nofree norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" 
"frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv69" "target-features"="+hvx-length128b,+hvxv69,+v69,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/hfnosplat_cp.ll b/llvm/test/CodeGen/Hexagon/autohvx/hfnosplat_cp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/hfnosplat_cp.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=hexagon < %s | FileCheck %s + +; Check that the vsplat instruction is generated +; CHECK: .word 1097875824 +; CHECK: .word 1048133241 +; CHECK: .word 0 + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" +; Function Attrs: nofree norecurse nounwind writeonly +define dso_local i32 @foo(half* nocapture %a) local_unnamed_addr #0 { +vector.body: + %0 = bitcast half* %a to <40 x half>* + store <40 x half> , <40 x half>* %0, align 2 + ret i32 0 +} + +attributes #0 = { nofree norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv69" "target-features"="+hvx-length128b,+hvxv69,+v69,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/hfsplat.ll b/llvm/test/CodeGen/Hexagon/autohvx/hfsplat.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/hfsplat.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=hexagon < %s | FileCheck %s + +; Check that the vsplat instruction is generated +; CHECK: r[[V:[0-9]+]] = #16752 +; CHECK: vsplat(r[[V]]) + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" +; Function Attrs: nofree norecurse nounwind writeonly +define dso_local i32 @foo(half* nocapture %0, i32 %1) local_unnamed_addr #0 { + %3 = icmp sgt i32 %1, 0 + br i1 %3, label %4, label %22 + +4: ; preds = %2 + %5 = icmp ult i32 %1, 128 + br i1 %5, label %6, label %9 + +6: ; preds = %20, %4 + %7 = phi half* [ %0, %4 ], [ %11, %20 ] + %8 = phi i32 [ 0, %4 ], [ %10, %20 ] + br label %23 + +9: ; preds = %4 + %10 = and i32 %1, -128 + %11 = getelementptr half, half* %0, i32 %10 + br label %12 + +12: ; preds = %12, %9 + %13 = phi i32 [ 0, %9 ], [ %18, %12 ] + %14 = getelementptr half, half* %0, i32 %13 + %15 = bitcast half* %14 to <64 x half>* + store <64 x half> , <64 x half>* %15, align 2 + %16 = getelementptr half, half* %14, i32 64 + %17 = bitcast half* %16 to <64 x half>* + store <64 x half> , <64 x half>* %17, align 2 + %18 = add i32 %13, 128 + %19 = icmp eq i32 %18, %10 + br i1 %19, label %20, label %12 + +20: ; preds = %12 + %21 = icmp eq i32 %10, %1 + br i1 %21, label %22, label %6 + +22: ; preds = %23, %20, %2 + ret i32 0 + +23: ; preds = %23, %6 + %24 = phi half* [ %28, %23 ], [ %7, %6 ] + %25 = phi i32 [ %26, %23 ], [ %8, %6 ] + store half 0xH4170, half* %24, align 2 + %26 = add nuw nsw i32 %25, 1 + 
%27 = icmp eq i32 %26, %1 + %28 = getelementptr half, half* %24, i32 1 + br i1 %27, label %22, label %23 +} + +attributes #0 = { nofree norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv69" "target-features"="+hvx-length128b,+hvxv69,+v69,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=hexagon < %s | FileCheck %s + +define <32 x i32> @fred(i32 %a0) #0 { +; CHECK-LABEL: fred: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r3:2 = combine(#76,#7) +; CHECK-NEXT: r1 = #12 +; CHECK-NEXT: r4 = #9 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vror(v0,r1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.w = vinsert(r2) +; CHECK-NEXT: r2 = #20 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vror(v0,r3) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.w = vinsert(r4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vror(v0,r2) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0.w = vinsert(r0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vror(v0,r2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <32 x i32> undef, i32 undef, i32 0 + %v1 = insertelement <32 x i32> %v0, i32 undef, i32 1 + %v2 = insertelement <32 x i32> %v1, i32 undef, i32 2 + %v3 = insertelement <32 x i32> %v2, i32 7, i32 3 + %v4 = insertelement <32 x i32> %v3, i32 undef, i32 4 + %v5 = insertelement <32 x i32> %v4, i32 undef, i32 5 + %v6 = insertelement <32 x i32> %v5, i32 undef, i32 6 + %v7 = insertelement <32 x i32> %v6, i32 undef, i32 7 + %v8 = insertelement <32 x i32> %v7, i32 undef, i32 8 + %v9 = insertelement <32 x i32> %v8, i32 undef, i32 9 + %v10 = insertelement <32 x i32> %v9, i32 undef, i32 10 + %v11 = insertelement <32 x i32> %v10, i32 undef, i32 11 + %v12 = insertelement <32 x i32> %v11, i32 undef, i32 12 + %v13 = insertelement <32 x i32> %v12, i32 undef, i32 13 + %v14 = insertelement <32 x i32> %v13, i32 undef, i32 14 + %v15 = insertelement <32 x i32> %v14, i32 undef, i32 15 + %v16 = insertelement <32 x i32> %v15, i32 undef, i32 16 + %v17 = insertelement <32 x i32> %v16, i32 undef, i32 17 + %v18 = insertelement <32 x i32> %v17, i32 undef, i32 18 + %v19 = insertelement <32 x i32> %v18, i32 undef, i32 19 + %v20 = insertelement <32 x i32> %v19, i32 undef, i32 20 + %v21 = insertelement <32 x i32> %v20, i32 undef, i32 21 + %v22 = insertelement <32 x i32> %v21, i32 9, i32 22 + %v23 = insertelement <32 x i32> %v22, i32 undef, i32 23 + %v24 = insertelement <32 x i32> %v23, i32 undef, i32 24 + %v25 = insertelement <32 x i32> %v24, i32 undef, i32 25 + %v26 = insertelement <32 x i32> %v25, i32 undef, i32 26 + %v27 = insertelement <32 x i32> %v26, i32 %a0, i32 27 + %v28 = insertelement <32 x i32> %v27, i32 undef, i32 28 + %v29 = insertelement <32 x i32> %v28, i32 undef, i32 29 + %v30 = insertelement <32 x i32> %v29, i32 undef, i32 30 + %v31 = insertelement <32 x i32> %v30, i32 
undef, i32 31 + ret <32 x i32> %v31 +} + +attributes #0 = { "target-cpu"="hexagonv66" "target-features"="+hvx,+hvx-length128b" } + diff --git a/llvm/test/CodeGen/Hexagon/autohvx/splat.ll b/llvm/test/CodeGen/Hexagon/autohvx/splat.ll --- a/llvm/test/CodeGen/Hexagon/autohvx/splat.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/splat.ll @@ -397,5 +397,34 @@ ret <64 x i32> %v1 } +; Splat register, 16 bit fp, v68+ +define <64 x half> @f24(i16 %a0) #2 { +; CHECK-LABEL: f24: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0.h = vsplat(r0) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = bitcast i16 %a0 to half + %v1 = insertelement <64 x half> undef, half %v0, i32 0 + %v2 = shufflevector <64 x half> %v1, <64 x half> undef, <64 x i32> zeroinitializer + ret <64 x half> %v2 +} + +; Splat register, 32 bit fp, v68+ +define <32 x float> @f25(float %a0) #2 { +; CHECK-LABEL: f25: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vsplat(r0) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %v0 = insertelement <32 x float> undef, float %a0, i32 0 + %v1 = shufflevector <32 x float> %v0, <32 x float> undef, <32 x i32> zeroinitializer + ret <32 x float> %v1 +} + + attributes #0 = { nounwind readnone "target-cpu"="hexagonv60" "target-features"="+hvxv60,+hvx-length128b" } attributes #1 = { nounwind readnone "target-cpu"="hexagonv62" "target-features"="+hvxv62,+hvx-length128b" } +attributes #2 = { nounwind readnone "target-cpu"="hexagonv69" "target-features"="+hvxv69,+hvx-length128b,+hvx-qfloat" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-load-store-basic.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-load-store-basic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-load-store-basic.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=hexagon < %s | FileCheck %s + +define void @f0(<128 x i8>* %a0, <128 x i8>* %a1) #0 { +; CHECK-LABEL: f0: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: vmem(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <128 x i8>, <128 x i8>* %a0, i32 1 + %v1 = load <128 x i8>, <128 x i8>* %v0, align 128 + %v2 = getelementptr <128 x i8>, <128 x i8>* %a1, i32 2 + store <128 x i8> %v1, <128 x i8>* %v2, align 128 + ret void +} + +define void @f1(<64 x i16>* %a0, <64 x i16>* %a1) #0 { +; CHECK-LABEL: f1: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: vmem(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <64 x i16>, <64 x i16>* %a0, i32 1 + %v1 = load <64 x i16>, <64 x i16>* %v0, align 128 + %v2 = getelementptr <64 x i16>, <64 x i16>* %a1, i32 2 + store <64 x i16> %v1, <64 x i16>* %v2, align 128 + ret void +} + +define void @f2(<32 x i32>* %a0, <32 x i32>* %a1) #0 { +; CHECK-LABEL: f2: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: vmem(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <32 x i32>, <32 x i32>* %a0, i32 1 + %v1 = load <32 x i32>, <32 x i32>* %v0, align 128 + %v2 = getelementptr <32 x i32>, <32 x i32>* %a1, i32 2 + store <32 x i32> %v1, <32 x i32>* %v2, align 128 + ret void +} + +define void @f3(<64 x half>* %a0, <64 x half>* %a1) #0 { +; CHECK-LABEL: f3: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: vmem(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <64 x half>, <64 x half>* %a0, i32 1 + %v1 = load <64 
x half>, <64 x half>* %v0, align 128 + %v2 = getelementptr <64 x half>, <64 x half>* %a1, i32 2 + store <64 x half> %v1, <64 x half>* %v2, align 128 + ret void +} + +define void @f4(<32 x float>* %a0, <32 x float>* %a1) #0 { +; CHECK-LABEL: f4: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: vmem(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <32 x float>, <32 x float>* %a0, i32 1 + %v1 = load <32 x float>, <32 x float>* %v0, align 128 + %v2 = getelementptr <32 x float>, <32 x float>* %a1, i32 2 + store <32 x float> %v1, <32 x float>* %v2, align 128 + ret void +} + +define void @f5(<128 x i8>* %a0, <128 x i8>* %a1) #0 { +; CHECK-LABEL: f5: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmemu(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmemu(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <128 x i8>, <128 x i8>* %a0, i32 1 + %v1 = load <128 x i8>, <128 x i8>* %v0, align 1 + %v2 = getelementptr <128 x i8>, <128 x i8>* %a1, i32 2 + store <128 x i8> %v1, <128 x i8>* %v2, align 1 + ret void +} + +define void @f6(<64 x i16>* %a0, <64 x i16>* %a1) #0 { +; CHECK-LABEL: f6: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmemu(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmemu(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <64 x i16>, <64 x i16>* %a0, i32 1 + %v1 = load <64 x i16>, <64 x i16>* %v0, align 1 + %v2 = getelementptr <64 x i16>, <64 x i16>* %a1, i32 2 + store <64 x i16> %v1, <64 x i16>* %v2, align 1 + ret void +} + +define void @f7(<32 x i32>* %a0, <32 x i32>* %a1) #0 { +; CHECK-LABEL: f7: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmemu(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmemu(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <32 x i32>, <32 x i32>* %a0, i32 1 + %v1 = load <32 x i32>, <32 x i32>* %v0, align 1 + %v2 = getelementptr <32 x i32>, <32 x i32>* %a1, i32 2 + store <32 x i32> %v1, <32 x i32>* %v2, align 1 + ret void +} + +define void @f8(<64 x half>* %a0, <64 x half>* %a1) #0 { +; CHECK-LABEL: f8: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmemu(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmemu(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <64 x half>, <64 x half>* %a0, i32 1 + %v1 = load <64 x half>, <64 x half>* %v0, align 1 + %v2 = getelementptr <64 x half>, <64 x half>* %a1, i32 2 + store <64 x half> %v1, <64 x half>* %v2, align 1 + ret void +} + +define void @f9(<32 x float>* %a0, <32 x float>* %a1) #0 { +; CHECK-LABEL: f9: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: v0 = vmemu(r0+#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: vmemu(r1+#2) = v0 +; CHECK-NEXT: } + %v0 = getelementptr <32 x float>, <32 x float>* %a0, i32 1 + %v1 = load <32 x float>, <32 x float>* %v0, align 1 + %v2 = getelementptr <32 x float>, <32 x float>* %a1, i32 2 + store <32 x float> %v1, <32 x float>* %v2, align 1 + ret void +} + +attributes #0 = { nounwind "target-cpu"="hexagonv69" "target-features"="+hvxv69,+hvx-length128b,+hvx-qfloat" } diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll --- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll @@ -2116,7 +2116,7 @@ ; RV64IA-LABEL: atomicrmw_max_i8_monotonic: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, 
a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -2264,7 +2264,7 @@ ; RV64IA-LABEL: atomicrmw_max_i8_acquire: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -2412,7 +2412,7 @@ ; RV64IA-LABEL: atomicrmw_max_i8_release: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -2560,7 +2560,7 @@ ; RV64IA-LABEL: atomicrmw_max_i8_acq_rel: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -2708,7 +2708,7 @@ ; RV64IA-LABEL: atomicrmw_max_i8_seq_cst: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -2856,7 +2856,7 @@ ; RV64IA-LABEL: atomicrmw_min_i8_monotonic: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -3004,7 +3004,7 @@ ; RV64IA-LABEL: atomicrmw_min_i8_acquire: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -3152,7 +3152,7 @@ ; RV64IA-LABEL: atomicrmw_min_i8_release: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -3300,7 +3300,7 @@ ; RV64IA-LABEL: atomicrmw_min_i8_acq_rel: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -3448,7 +3448,7 @@ ; RV64IA-LABEL: atomicrmw_min_i8_seq_cst: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -6997,7 +6997,7 @@ ; RV64IA-LABEL: atomicrmw_max_i16_monotonic: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ -7147,7 +7147,7 @@ ; RV64IA-LABEL: atomicrmw_max_i16_acquire: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ -7297,7 +7297,7 @@ ; RV64IA-LABEL: atomicrmw_max_i16_release: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ -7447,7 +7447,7 @@ ; RV64IA-LABEL: atomicrmw_max_i16_acq_rel: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ 
-7597,7 +7597,7 @@ ; RV64IA-LABEL: atomicrmw_max_i16_seq_cst: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ -7747,7 +7747,7 @@ ; RV64IA-LABEL: atomicrmw_min_i16_monotonic: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ -7897,7 +7897,7 @@ ; RV64IA-LABEL: atomicrmw_min_i16_acquire: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ -8047,7 +8047,7 @@ ; RV64IA-LABEL: atomicrmw_min_i16_release: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ -8197,7 +8197,7 @@ ; RV64IA-LABEL: atomicrmw_min_i16_acq_rel: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ -8347,7 +8347,7 @@ ; RV64IA-LABEL: atomicrmw_min_i16_seq_cst: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll --- a/llvm/test/CodeGen/RISCV/atomic-signext.ll +++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll @@ -696,7 +696,7 @@ ; RV64IA-LABEL: atomicrmw_max_i8_monotonic: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -850,7 +850,7 @@ ; RV64IA-LABEL: atomicrmw_min_i8_monotonic: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: li a4, 255 ; RV64IA-NEXT: sllw a7, a4, a0 @@ -1753,7 +1753,7 @@ ; RV64IA-LABEL: atomicrmw_max_i16_monotonic: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 @@ -1909,7 +1909,7 @@ ; RV64IA-LABEL: atomicrmw_min_i16_monotonic: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a6, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: slliw a0, a0, 3 ; RV64IA-NEXT: andi a3, a0, 24 ; RV64IA-NEXT: lui a4, 16 ; RV64IA-NEXT: addiw a4, a4, -1 diff --git a/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll --- a/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll @@ -173,7 +173,7 @@ ; RV64I-NEXT: andi a0, a0, 51 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: addw a0, a0, a1 ; RV64I-NEXT: andi a0, a0, 15 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB3_2: @@ -596,7 +596,7 @@ ; RV64I-NEXT: andi a0, a0, 51 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: addw a0, a0, a1 ; RV64I-NEXT: andi a0, a0, 15 ; RV64I-NEXT: ret %tmp = 
call i8 @llvm.cttz.i8(i8 %a, i1 true) diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll @@ -0,0 +1,1015 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32,RV32IM %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-zba,+experimental-zbb \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32,RV32IMZB %s +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64,RV64IM %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-zba,+experimental-zbb \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64,RV64IMZB %s + +; Test that there is a single shift after the mul and no addition. +define i32 @udiv_constant_no_add(i32 %a) nounwind { +; RV32-LABEL: udiv_constant_no_add: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, 838861 +; RV32-NEXT: addi a1, a1, -819 +; RV32-NEXT: mulhu a0, a0, a1 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: ret +; +; RV64IM-LABEL: udiv_constant_no_add: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 32 +; RV64IM-NEXT: lui a1, 838861 +; RV64IM-NEXT: addiw a1, a1, -819 +; RV64IM-NEXT: slli a1, a1, 32 +; RV64IM-NEXT: mulhu a0, a0, a1 +; RV64IM-NEXT: srli a0, a0, 34 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: udiv_constant_no_add: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: zext.w a0, a0 +; RV64IMZB-NEXT: lui a1, 838861 +; RV64IMZB-NEXT: addiw a1, a1, -819 +; RV64IMZB-NEXT: zext.w a1, a1 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: srli a0, a0, 34 +; RV64IMZB-NEXT: ret + %1 = udiv i32 %a, 5 + ret i32 %1 +} + +; This constant requires a sub, srli, add sequence after the mul. 
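To make these comments concrete before the tests themselves: an unsigned divide by a constant d is lowered to a widening multiply by a magic number m close to 2^k/d, followed by a right shift by k. For d = 5, m = ceil(2^34/5) = 0xCCCCCCCD fits in 32 bits, so a single shift after the multiply suffices; for d = 7 the exact magic needs 33 bits, so the truncated m = 0x24924925 is used and the quotient is repaired with the sub/srli/add fixup. A sketch of both forms in plain i32/i64 IR; the function names are invented and this is an illustration, not part of the patch:

define i32 @sketch_udiv5(i32 %a) {
  ; m = ceil(2^34 / 5) = 0xCCCCCCCD fits in 32 bits:
  ; the quotient is simply (a * m) >> 34.
  %wide = mul i64 3435973837, 1               ; a * 0xCCCCCCCD follows
  %ext = zext i32 %a to i64
  %mul = mul i64 %ext, 3435973837
  %shr = lshr i64 %mul, 34
  %q = trunc i64 %shr to i32
  ret i32 %q
}

define i32 @sketch_udiv7(i32 %a) {
  ; The exact magic for 7 needs 33 bits, so keep the truncated
  ; m = 0x24924925 and fix up: q = mulhu(a, m),
  ; result = (((a - q) >> 1) + q) >> 2, which cannot overflow.
  %ext = zext i32 %a to i64
  %mul = mul i64 %ext, 613566757              ; a * 0x24924925
  %hi = lshr i64 %mul, 32
  %q = trunc i64 %hi to i32                   ; mulhu(a, m)
  %sub = sub i32 %a, %q
  %half = lshr i32 %sub, 1
  %sum = add i32 %half, %q
  %res = lshr i32 %sum, 2
  ret i32 %res
}

The zext/mul/lshr-by-32 triple here is the IR spelling of the mulhu that the checks look for.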
+define i32 @udiv_constant_add(i32 %a) nounwind { +; RV32-LABEL: udiv_constant_add: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, 149797 +; RV32-NEXT: addi a1, a1, -1755 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: ret +; +; RV64IM-LABEL: udiv_constant_add: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a1, a0, 32 +; RV64IM-NEXT: lui a2, 149797 +; RV64IM-NEXT: addiw a2, a2, -1755 +; RV64IM-NEXT: slli a2, a2, 32 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: subw a0, a0, a1 +; RV64IM-NEXT: srliw a0, a0, 1 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: srli a0, a0, 2 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: udiv_constant_add: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: zext.w a1, a0 +; RV64IMZB-NEXT: lui a2, 149797 +; RV64IMZB-NEXT: addiw a2, a2, -1755 +; RV64IMZB-NEXT: mul a1, a1, a2 +; RV64IMZB-NEXT: srli a1, a1, 32 +; RV64IMZB-NEXT: subw a0, a0, a1 +; RV64IMZB-NEXT: srliw a0, a0, 1 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: srli a0, a0, 2 +; RV64IMZB-NEXT: ret + %1 = udiv i32 %a, 7 + ret i32 %1 +} + +define i64 @udiv64_constant_no_add(i64 %a) nounwind { +; RV32-LABEL: udiv64_constant_no_add: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 5 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv64_constant_no_add: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 1035469 +; RV64-NEXT: addiw a1, a1, -819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: mulhu a0, a0, a1 +; RV64-NEXT: srli a0, a0, 2 +; RV64-NEXT: ret + %1 = udiv i64 %a, 5 + ret i64 %1 +} + +define i64 @udiv64_constant_add(i64 %a) nounwind { +; RV32-LABEL: udiv64_constant_add: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 7 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv64_constant_add: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4681 +; RV64-NEXT: addiw a1, a1, 585 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 585 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 585 +; RV64-NEXT: slli a1, a1, 13 +; RV64-NEXT: addi a1, a1, 1171 +; RV64-NEXT: mulhu a1, a0, a1 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: srli a0, a0, 2 +; RV64-NEXT: ret + %1 = udiv i64 %a, 7 + ret i64 %1 +} + +define i8 @udiv8_constant_no_add(i8 %a) nounwind { +; RV32-LABEL: udiv8_constant_no_add: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 255 +; RV32-NEXT: li a1, 205 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: srli a0, a0, 10 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv8_constant_no_add: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 255 +; RV64-NEXT: li a1, 205 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: srli a0, a0, 10 +; RV64-NEXT: ret + %1 = udiv i8 %a, 5 + ret i8 %1 +} + +define i8 @udiv8_constant_add(i8 %a) nounwind { +; RV32IM-LABEL: udiv8_constant_add: +; RV32IM: # %bb.0: +; RV32IM-NEXT: andi a1, a0, 255 +; RV32IM-NEXT: li a2, 37 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: srli a1, a1, 8 +; RV32IM-NEXT: sub a0, a0, 
a1 +; RV32IM-NEXT: andi a0, a0, 254 +; RV32IM-NEXT: srli a0, a0, 1 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: srli a0, a0, 2 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: udiv8_constant_add: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: andi a1, a0, 255 +; RV32IMZB-NEXT: sh3add a2, a1, a1 +; RV32IMZB-NEXT: sh2add a1, a2, a1 +; RV32IMZB-NEXT: srli a1, a1, 8 +; RV32IMZB-NEXT: sub a0, a0, a1 +; RV32IMZB-NEXT: andi a0, a0, 254 +; RV32IMZB-NEXT: srli a0, a0, 1 +; RV32IMZB-NEXT: add a0, a0, a1 +; RV32IMZB-NEXT: srli a0, a0, 2 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: udiv8_constant_add: +; RV64IM: # %bb.0: +; RV64IM-NEXT: andi a1, a0, 255 +; RV64IM-NEXT: li a2, 37 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 8 +; RV64IM-NEXT: subw a0, a0, a1 +; RV64IM-NEXT: andi a0, a0, 254 +; RV64IM-NEXT: srli a0, a0, 1 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: srli a0, a0, 2 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: udiv8_constant_add: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: andi a1, a0, 255 +; RV64IMZB-NEXT: sh3add a2, a1, a1 +; RV64IMZB-NEXT: sh2add a1, a2, a1 +; RV64IMZB-NEXT: srli a1, a1, 8 +; RV64IMZB-NEXT: subw a0, a0, a1 +; RV64IMZB-NEXT: andi a0, a0, 254 +; RV64IMZB-NEXT: srli a0, a0, 1 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: srli a0, a0, 2 +; RV64IMZB-NEXT: ret + %1 = udiv i8 %a, 7 + ret i8 %1 +} + +define i16 @udiv16_constant_no_add(i16 %a) nounwind { +; RV32IM-LABEL: udiv16_constant_no_add: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: lui a1, 838864 +; RV32IM-NEXT: mulhu a0, a0, a1 +; RV32IM-NEXT: srli a0, a0, 18 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: udiv16_constant_no_add: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: zext.h a0, a0 +; RV32IMZB-NEXT: lui a1, 13 +; RV32IMZB-NEXT: addi a1, a1, -819 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srli a0, a0, 18 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: udiv16_constant_no_add: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lui a1, 52429 +; RV64IM-NEXT: slli a1, a1, 4 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: mulhu a0, a0, a1 +; RV64IM-NEXT: srli a0, a0, 18 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: udiv16_constant_no_add: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: zext.h a0, a0 +; RV64IMZB-NEXT: lui a1, 13 +; RV64IMZB-NEXT: addiw a1, a1, -819 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: srli a0, a0, 18 +; RV64IMZB-NEXT: ret + %1 = udiv i16 %a, 5 + ret i16 %1 +} + +define i16 @udiv16_constant_add(i16 %a) nounwind { +; RV32IM-LABEL: udiv16_constant_add: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: lui a2, 149808 +; RV32IM-NEXT: mulhu a1, a1, a2 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srli a0, a0, 17 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: srli a0, a0, 2 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: udiv16_constant_add: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: lui a2, 2 +; RV32IMZB-NEXT: addi a2, a2, 1171 +; RV32IMZB-NEXT: mul a1, a1, a2 +; RV32IMZB-NEXT: srli a1, a1, 16 +; RV32IMZB-NEXT: sub a0, a0, a1 +; RV32IMZB-NEXT: zext.h a0, a0 +; RV32IMZB-NEXT: srli a0, a0, 1 +; RV32IMZB-NEXT: add a0, a0, a1 +; RV32IMZB-NEXT: srli a0, a0, 2 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: udiv16_constant_add: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: lui a2, 149808 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 16 +; RV64IM-NEXT: subw a0, a0, a1 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srli a0, a0, 49 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: srli 
a0, a0, 2 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: udiv16_constant_add: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: lui a2, 2 +; RV64IMZB-NEXT: addiw a2, a2, 1171 +; RV64IMZB-NEXT: mul a1, a1, a2 +; RV64IMZB-NEXT: srli a1, a1, 16 +; RV64IMZB-NEXT: subw a0, a0, a1 +; RV64IMZB-NEXT: zext.h a0, a0 +; RV64IMZB-NEXT: srli a0, a0, 1 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: srli a0, a0, 2 +; RV64IMZB-NEXT: ret + %1 = udiv i16 %a, 7 + ret i16 %1 +} + +; Test the simplest case a srli and an add after the mul. No srai. +define i32 @sdiv_constant_no_srai(i32 %a) nounwind { +; RV32-LABEL: sdiv_constant_no_srai: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1366 +; RV32-NEXT: mulh a0, a0, a1 +; RV32-NEXT: srli a1, a0, 31 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv_constant_no_srai: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: addiw a1, a1, 1366 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: addw a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i32 %a, 3 + ret i32 %1 +} + +; This constant requires an srai between the mul and the add. +define i32 @sdiv_constant_srai(i32 %a) nounwind { +; RV32-LABEL: sdiv_constant_srai: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, 419430 +; RV32-NEXT: addi a1, a1, 1639 +; RV32-NEXT: mulh a0, a0, a1 +; RV32-NEXT: srli a1, a0, 31 +; RV32-NEXT: srai a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv_constant_srai: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: lui a1, 419430 +; RV64-NEXT: addiw a1, a1, 1639 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: srai a0, a0, 33 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i32 %a, 5 + ret i32 %1 +} + +; This constant requires an add and an srai after the mul. +define i32 @sdiv_constant_add_srai(i32 %a) nounwind { +; RV32-LABEL: sdiv_constant_add_srai: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, 599186 +; RV32-NEXT: addi a1, a1, 1171 +; RV32-NEXT: mulh a1, a0, a1 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: srli a1, a0, 31 +; RV32-NEXT: srai a0, a0, 2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv_constant_add_srai: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a0 +; RV64-NEXT: lui a2, 599186 +; RV64-NEXT: addiw a2, a2, 1171 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: addw a0, a1, a0 +; RV64-NEXT: srliw a1, a0, 31 +; RV64-NEXT: sraiw a0, a0, 2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i32 %a, 7 + ret i32 %1 +} + +; This constant requires a sub and an srai after the mul. 
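The signed cases swap mulhu for mulh and add one more wrinkle: srai rounds toward minus infinity, while sdiv must round toward zero, so the sign bit of the shifted intermediate is added back; and when the magic constant overflows, the dividend is added (d = 7) or subtracted (d = -7) before the shift. A sketch of the divide-by-negative-7 case that the next test checks; the function name is invented and the sketch mirrors the RV32 sequence rather than defining it:

define i32 @sketch_sdiv_minus7(i32 %a) {
  ; q0 = mulh(a, 0x6DB6DB6D) - a, then an arithmetic shift by 2;
  ; adding the sign bit converts the floor result to round-toward-zero.
  %ext = sext i32 %a to i64
  %mul = mul i64 %ext, 1840700269             ; a * 0x6DB6DB6D
  %hi = ashr i64 %mul, 32
  %q0 = trunc i64 %hi to i32                  ; mulh(a, m)
  %q1 = sub i32 %q0, %a                       ; fixup for the oversized magic
  %sign = lshr i32 %q1, 31                    ; 1 iff the intermediate is negative
  %shift = ashr i32 %q1, 2
  %res = add i32 %shift, %sign
  ret i32 %res
}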
+define i32 @sdiv_constant_sub_srai(i32 %a) nounwind { +; RV32-LABEL: sdiv_constant_sub_srai: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, 449390 +; RV32-NEXT: addi a1, a1, -1171 +; RV32-NEXT: mulh a1, a0, a1 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: srli a1, a0, 31 +; RV32-NEXT: srai a0, a0, 2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv_constant_sub_srai: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a0 +; RV64-NEXT: lui a2, 449390 +; RV64-NEXT: addiw a2, a2, -1171 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: subw a0, a1, a0 +; RV64-NEXT: srliw a1, a0, 31 +; RV64-NEXT: sraiw a0, a0, 2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i32 %a, -7 + ret i32 %1 +} + +define i64 @sdiv64_constant_no_srai(i64 %a) nounwind { +; RV32-LABEL: sdiv64_constant_no_srai: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 3 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __divdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv64_constant_no_srai: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 21845 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1366 +; RV64-NEXT: mulh a0, a0, a1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i64 %a, 3 + ret i64 %1 +} + +define i64 @sdiv64_constant_srai(i64 %a) nounwind { +; RV32-LABEL: sdiv64_constant_srai: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 5 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __divdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv64_constant_srai: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 13107 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 819 +; RV64-NEXT: slli a1, a1, 13 +; RV64-NEXT: addi a1, a1, 1639 +; RV64-NEXT: mulh a0, a0, a1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: srai a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i64 %a, 5 + ret i64 %1 +} + +define i64 @sdiv64_constant_add_srai(i64 %a) nounwind { +; RV32-LABEL: sdiv64_constant_add_srai: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 15 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __divdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv64_constant_add_srai: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 1017993 +; RV64-NEXT: addiw a1, a1, -1911 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: mulh a1, a0, a1 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: srai a0, a0, 3 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i64 %a, 15 + ret i64 %1 +} + +define i64 @sdiv64_constant_sub_srai(i64 %a) nounwind { +; RV32-LABEL: sdiv64_constant_sub_srai: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, -3 +; RV32-NEXT: li a3, -1 +; RV32-NEXT: 
call __divdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv64_constant_sub_srai: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 21845 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: mulh a1, a0, a1 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: srai a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i64 %a, -3 + ret i64 %1 +} + +define i8 @sdiv8_constant_no_srai(i8 %a) nounwind { +; RV32IM-LABEL: sdiv8_constant_no_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: li a1, 86 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: srli a1, a0, 8 +; RV32IM-NEXT: srli a0, a0, 15 +; RV32IM-NEXT: andi a0, a0, 1 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv8_constant_no_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: li a1, 86 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srli a1, a0, 8 +; RV32IMZB-NEXT: srli a0, a0, 15 +; RV32IMZB-NEXT: andi a0, a0, 1 +; RV32IMZB-NEXT: add a0, a1, a0 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv8_constant_no_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: li a1, 86 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: srli a1, a0, 8 +; RV64IM-NEXT: srli a0, a0, 15 +; RV64IM-NEXT: andi a0, a0, 1 +; RV64IM-NEXT: add a0, a1, a0 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv8_constant_no_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: li a1, 86 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: srli a1, a0, 8 +; RV64IMZB-NEXT: srli a0, a0, 15 +; RV64IMZB-NEXT: andi a0, a0, 1 +; RV64IMZB-NEXT: add a0, a1, a0 +; RV64IMZB-NEXT: ret + %1 = sdiv i8 %a, 3 + ret i8 %1 +} + +define i8 @sdiv8_constant_srai(i8 %a) nounwind { +; RV32IM-LABEL: sdiv8_constant_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: li a1, 103 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: srai a1, a0, 9 +; RV32IM-NEXT: srli a0, a0, 15 +; RV32IM-NEXT: andi a0, a0, 1 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv8_constant_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: li a1, 103 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srai a1, a0, 9 +; RV32IMZB-NEXT: srli a0, a0, 15 +; RV32IMZB-NEXT: andi a0, a0, 1 +; RV32IMZB-NEXT: add a0, a1, a0 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv8_constant_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: li a1, 103 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: srai a1, a0, 9 +; RV64IM-NEXT: srli a0, a0, 15 +; RV64IM-NEXT: andi a0, a0, 1 +; RV64IM-NEXT: add a0, a1, a0 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv8_constant_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: li a1, 103 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: srai a1, a0, 9 +; RV64IMZB-NEXT: srli a0, a0, 15 +; RV64IMZB-NEXT: andi a0, a0, 1 +; RV64IMZB-NEXT: add a0, a1, a0 +; RV64IMZB-NEXT: ret + %1 = sdiv i8 %a, 5 + ret i8 %1 +} + +; FIXME: Can shorten the code after the mul by using slli+srai/srli like the +; i16 version without Zbb. 
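+; A hedged sketch of the shorter tail the FIXME asks for, mirroring the
+; shared-slli pattern of sdiv16_constant_add_srai; the i8 shift amounts
+; (24, and 24 + 2 = 26) are assumptions, not generated output:
+;
+;   slli a0, a0, 24    ; place the 8-bit value in the top byte
+;   srli a1, a0, 31    ; sign bit, for rounding toward zero
+;   srai a0, a0, 26    ; one arithmetic shift does the sext + srai-by-2
+;   add  a0, a0, a1
+;
+; i.e. a four-instruction tail instead of the five below
+; (andi+srli+slli+srai+add).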
+define i8 @sdiv8_constant_add_srai(i8 %a) nounwind { +; RV32IM-LABEL: sdiv8_constant_add_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a1, a0, 24 +; RV32IM-NEXT: srai a1, a1, 24 +; RV32IM-NEXT: li a2, -109 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: srli a1, a1, 8 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: andi a1, a0, 128 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 26 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv8_constant_add_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: sext.b a1, a0 +; RV32IMZB-NEXT: li a2, -109 +; RV32IMZB-NEXT: mul a1, a1, a2 +; RV32IMZB-NEXT: srli a1, a1, 8 +; RV32IMZB-NEXT: add a0, a1, a0 +; RV32IMZB-NEXT: andi a1, a0, 128 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 2 +; RV32IMZB-NEXT: add a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv8_constant_add_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a1, a0, 56 +; RV64IM-NEXT: srai a1, a1, 56 +; RV64IM-NEXT: li a2, -109 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 8 +; RV64IM-NEXT: addw a0, a1, a0 +; RV64IM-NEXT: andi a1, a0, 128 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 58 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv8_constant_add_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: sext.b a1, a0 +; RV64IMZB-NEXT: li a2, -109 +; RV64IMZB-NEXT: mul a1, a1, a2 +; RV64IMZB-NEXT: srli a1, a1, 8 +; RV64IMZB-NEXT: addw a0, a1, a0 +; RV64IMZB-NEXT: andi a1, a0, 128 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: srai a0, a0, 2 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i8 %a, 7 + ret i8 %1 +} + +; FIXME: Can shorten the code after the mul by using slli+srai/srli like the +; i16 version without Zbb. 
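+; The same shared-slli tail sketched before sdiv8_constant_add_srai would
+; apply here as well; only the preceding sub vs. add differs.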
+define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { +; RV32IM-LABEL: sdiv8_constant_sub_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a1, a0, 24 +; RV32IM-NEXT: srai a1, a1, 24 +; RV32IM-NEXT: li a2, 109 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: srli a1, a1, 8 +; RV32IM-NEXT: sub a0, a1, a0 +; RV32IM-NEXT: andi a1, a0, 128 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 26 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv8_constant_sub_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: sext.b a1, a0 +; RV32IMZB-NEXT: li a2, 109 +; RV32IMZB-NEXT: mul a1, a1, a2 +; RV32IMZB-NEXT: srli a1, a1, 8 +; RV32IMZB-NEXT: sub a0, a1, a0 +; RV32IMZB-NEXT: andi a1, a0, 128 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 2 +; RV32IMZB-NEXT: add a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv8_constant_sub_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a1, a0, 56 +; RV64IM-NEXT: srai a1, a1, 56 +; RV64IM-NEXT: li a2, 109 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 8 +; RV64IM-NEXT: subw a0, a1, a0 +; RV64IM-NEXT: andi a1, a0, 128 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 58 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv8_constant_sub_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: sext.b a1, a0 +; RV64IMZB-NEXT: li a2, 109 +; RV64IMZB-NEXT: mul a1, a1, a2 +; RV64IMZB-NEXT: srli a1, a1, 8 +; RV64IMZB-NEXT: subw a0, a1, a0 +; RV64IMZB-NEXT: andi a1, a0, 128 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: srai a0, a0, 2 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i8 %a, -7 + ret i8 %1 +} + +define i16 @sdiv16_constant_no_srai(i16 %a) nounwind { +; RV32IM-LABEL: sdiv16_constant_no_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: lui a1, 5 +; RV32IM-NEXT: addi a1, a1, 1366 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: srli a1, a0, 31 +; RV32IM-NEXT: srli a0, a0, 16 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv16_constant_no_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: lui a1, 5 +; RV32IMZB-NEXT: addi a1, a1, 1366 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srli a1, a0, 31 +; RV32IMZB-NEXT: srli a0, a0, 16 +; RV32IMZB-NEXT: add a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv16_constant_no_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 48 +; RV64IM-NEXT: lui a1, 5 +; RV64IM-NEXT: addiw a1, a1, 1366 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: srliw a1, a0, 31 +; RV64IM-NEXT: srli a0, a0, 16 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv16_constant_no_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: lui a1, 5 +; RV64IMZB-NEXT: addiw a1, a1, 1366 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: srliw a1, a0, 31 +; RV64IMZB-NEXT: srli a0, a0, 16 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i16 %a, 3 + ret i16 %1 +} + +define i16 @sdiv16_constant_srai(i16 %a) nounwind { +; RV32IM-LABEL: sdiv16_constant_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: lui a1, 6 +; RV32IM-NEXT: addi a1, a1, 1639 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: srli a1, a0, 31 +; RV32IM-NEXT: srai a0, a0, 17 +; RV32IM-NEXT: add a0, a0, a1 
+; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv16_constant_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: lui a1, 6 +; RV32IMZB-NEXT: addi a1, a1, 1639 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srli a1, a0, 31 +; RV32IMZB-NEXT: srai a0, a0, 17 +; RV32IMZB-NEXT: add a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv16_constant_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 48 +; RV64IM-NEXT: lui a1, 6 +; RV64IM-NEXT: addiw a1, a1, 1639 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: srliw a1, a0, 31 +; RV64IM-NEXT: srai a0, a0, 17 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv16_constant_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: lui a1, 6 +; RV64IMZB-NEXT: addiw a1, a1, 1639 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: srliw a1, a0, 31 +; RV64IMZB-NEXT: srai a0, a0, 17 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i16 %a, 5 + ret i16 %1 +} + +; FIXME: The Zbb test code has 1 more instruction after the mul because we don't +; share a slli. +define i16 @sdiv16_constant_add_srai(i16 %a) nounwind { +; RV32IM-LABEL: sdiv16_constant_add_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srai a1, a1, 16 +; RV32IM-NEXT: lui a2, 1048569 +; RV32IM-NEXT: addi a2, a2, -1911 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srli a1, a0, 31 +; RV32IM-NEXT: srai a0, a0, 19 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv16_constant_add_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: sext.h a1, a0 +; RV32IMZB-NEXT: lui a2, 1048569 +; RV32IMZB-NEXT: addi a2, a2, -1911 +; RV32IMZB-NEXT: mul a1, a1, a2 +; RV32IMZB-NEXT: srli a1, a1, 16 +; RV32IMZB-NEXT: add a0, a1, a0 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 3 +; RV32IMZB-NEXT: add a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv16_constant_add_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srai a1, a1, 48 +; RV64IM-NEXT: lui a2, 1048569 +; RV64IM-NEXT: addiw a2, a2, -1911 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 16 +; RV64IM-NEXT: addw a0, a1, a0 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srli a1, a0, 63 +; RV64IM-NEXT: srai a0, a0, 51 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv16_constant_add_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: sext.h a1, a0 +; RV64IMZB-NEXT: lui a2, 1048569 +; RV64IMZB-NEXT: addiw a2, a2, -1911 +; RV64IMZB-NEXT: mul a1, a1, a2 +; RV64IMZB-NEXT: srli a1, a1, 16 +; RV64IMZB-NEXT: addw a0, a1, a0 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: srai a0, a0, 3 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i16 %a, 15 + ret i16 %1 +} + +; FIXME: The Zbb test code has 1 more instruction after the mul because we don't +; share a slli. 
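+; For reference, the contrast visible in the checks above: without Zbb a
+; single slli repositions the value for both the srli (sign bit) and the
+; srai, giving a four-instruction tail (slli+srli+srai+add), while the Zbb
+; code re-extends twice (zext.h for the sign bit, sext.h for the srai),
+; giving five (zext.h+srli+sext.h+srai+add).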
+define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind { +; RV32IM-LABEL: sdiv16_constant_sub_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srai a1, a1, 16 +; RV32IM-NEXT: lui a2, 7 +; RV32IM-NEXT: addi a2, a2, 1911 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: sub a0, a1, a0 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srli a1, a0, 31 +; RV32IM-NEXT: srai a0, a0, 19 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv16_constant_sub_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: sext.h a1, a0 +; RV32IMZB-NEXT: lui a2, 7 +; RV32IMZB-NEXT: addi a2, a2, 1911 +; RV32IMZB-NEXT: mul a1, a1, a2 +; RV32IMZB-NEXT: srli a1, a1, 16 +; RV32IMZB-NEXT: sub a0, a1, a0 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 3 +; RV32IMZB-NEXT: add a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv16_constant_sub_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srai a1, a1, 48 +; RV64IM-NEXT: lui a2, 7 +; RV64IM-NEXT: addiw a2, a2, 1911 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 16 +; RV64IM-NEXT: subw a0, a1, a0 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srli a1, a0, 63 +; RV64IM-NEXT: srai a0, a0, 51 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv16_constant_sub_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: sext.h a1, a0 +; RV64IMZB-NEXT: lui a2, 7 +; RV64IMZB-NEXT: addiw a2, a2, 1911 +; RV64IMZB-NEXT: mul a1, a1, a2 +; RV64IMZB-NEXT: srli a1, a1, 16 +; RV64IMZB-NEXT: subw a0, a1, a0 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: srai a0, a0, 3 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i16 %a, -15 + ret i16 %1 +} diff --git a/llvm/test/CodeGen/RISCV/double-intrinsics.ll b/llvm/test/CodeGen/RISCV/double-intrinsics.ll --- a/llvm/test/CodeGen/RISCV/double-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/double-intrinsics.ll @@ -1146,26 +1146,30 @@ ret double %1 } -declare iXLen @llvm.lrint.iXLen.f64(float) +declare iXLen @llvm.lrint.iXLen.f64(double) -define iXLen @lrint_f64(float %a) nounwind { +define iXLen @lrint_f64(double %a) nounwind { ; RV32IFD-LABEL: lrint_f64: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: fmv.w.x ft0, a0 -; RV32IFD-NEXT: fcvt.w.s a0, ft0 +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw a0, 8(sp) +; RV32IFD-NEXT: sw a1, 12(sp) +; RV32IFD-NEXT: fld ft0, 8(sp) +; RV32IFD-NEXT: fcvt.w.d a0, ft0 +; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: lrint_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fmv.w.x ft0, a0 -; RV64IFD-NEXT: fcvt.l.s a0, ft0 +; RV64IFD-NEXT: fmv.d.x ft0, a0 +; RV64IFD-NEXT: fcvt.l.d a0, ft0 ; RV64IFD-NEXT: ret ; ; RV32I-LABEL: lrint_f64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: call lrintf@plt +; RV32I-NEXT: call lrint@plt ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1174,34 +1178,38 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: call lrintf@plt +; RV64I-NEXT: call lrint@plt ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret - %1 = call iXLen @llvm.lrint.iXLen.f64(float %a) + %1 = call iXLen @llvm.lrint.iXLen.f64(double %a) ret iXLen %1 } -declare iXLen @llvm.lround.iXLen.f64(float) 
+declare iXLen @llvm.lround.iXLen.f64(double) -define iXLen @lround_f64(float %a) nounwind { +define iXLen @lround_f64(double %a) nounwind { ; RV32IFD-LABEL: lround_f64: ; RV32IFD: # %bb.0: -; RV32IFD-NEXT: fmv.w.x ft0, a0 -; RV32IFD-NEXT: fcvt.w.s a0, ft0, rmm +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw a0, 8(sp) +; RV32IFD-NEXT: sw a1, 12(sp) +; RV32IFD-NEXT: fld ft0, 8(sp) +; RV32IFD-NEXT: fcvt.w.d a0, ft0, rmm +; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: lround_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fmv.w.x ft0, a0 -; RV64IFD-NEXT: fcvt.l.s a0, ft0, rmm +; RV64IFD-NEXT: fmv.d.x ft0, a0 +; RV64IFD-NEXT: fcvt.l.d a0, ft0, rmm ; RV64IFD-NEXT: ret ; ; RV32I-LABEL: lround_f64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: call lroundf@plt +; RV32I-NEXT: call lround@plt ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1210,37 +1218,37 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: call lroundf@plt +; RV64I-NEXT: call lround@plt ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret - %1 = call iXLen @llvm.lround.iXLen.f64(float %a) + %1 = call iXLen @llvm.lround.iXLen.f64(double %a) ret iXLen %1 } -declare i64 @llvm.llrint.i64.f64(float) +declare i64 @llvm.llrint.i64.f64(double) -define i64 @llrint_f64(float %a) nounwind { +define i64 @llrint_f64(double %a) nounwind { ; RV32IFD-LABEL: llrint_f64: ; RV32IFD: # %bb.0: ; RV32IFD-NEXT: addi sp, sp, -16 ; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IFD-NEXT: call llrintf@plt +; RV32IFD-NEXT: call llrint@plt ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: llrint_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fmv.w.x ft0, a0 -; RV64IFD-NEXT: fcvt.l.s a0, ft0 +; RV64IFD-NEXT: fmv.d.x ft0, a0 +; RV64IFD-NEXT: fcvt.l.d a0, ft0 ; RV64IFD-NEXT: ret ; ; RV32I-LABEL: llrint_f64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: call llrintf@plt +; RV32I-NEXT: call llrint@plt ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1249,37 +1257,37 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: call llrintf@plt +; RV64I-NEXT: call llrint@plt ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret - %1 = call i64 @llvm.llrint.i64.f64(float %a) + %1 = call i64 @llvm.llrint.i64.f64(double %a) ret i64 %1 } -declare i64 @llvm.llround.i64.f64(float) +declare i64 @llvm.llround.i64.f64(double) -define i64 @llround_f64(float %a) nounwind { +define i64 @llround_f64(double %a) nounwind { ; RV32IFD-LABEL: llround_f64: ; RV32IFD: # %bb.0: ; RV32IFD-NEXT: addi sp, sp, -16 ; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IFD-NEXT: call llroundf@plt +; RV32IFD-NEXT: call llround@plt ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: llround_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fmv.w.x ft0, a0 -; RV64IFD-NEXT: fcvt.l.s a0, ft0, rmm +; RV64IFD-NEXT: fmv.d.x ft0, a0 +; RV64IFD-NEXT: fcvt.l.d a0, ft0, rmm ; RV64IFD-NEXT: ret ; ; RV32I-LABEL: llround_f64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw 
ra, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: call llroundf@plt +; RV32I-NEXT: call llround@plt ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1288,10 +1296,10 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: call llroundf@plt +; RV64I-NEXT: call llround@plt ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret - %1 = call i64 @llvm.llround.i64.f64(float %a) + %1 = call i64 @llvm.llround.i64.f64(double %a) ret i64 %1 } diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll --- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll @@ -1967,195 +1967,3 @@ %1 = call half @llvm.roundeven.f16(half %a) ret half %1 } - -declare iXLen @llvm.lrint.iXLen.f16(float) - -define iXLen @lrint_f16(float %a) nounwind { -; RV32IZFH-LABEL: lrint_f16: -; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: fcvt.w.s a0, fa0 -; RV32IZFH-NEXT: ret -; -; RV64IZFH-LABEL: lrint_f16: -; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.l.s a0, fa0 -; RV64IZFH-NEXT: ret -; -; RV32IDZFH-LABEL: lrint_f16: -; RV32IDZFH: # %bb.0: -; RV32IDZFH-NEXT: fcvt.w.s a0, fa0 -; RV32IDZFH-NEXT: ret -; -; RV64IDZFH-LABEL: lrint_f16: -; RV64IDZFH: # %bb.0: -; RV64IDZFH-NEXT: fcvt.l.s a0, fa0 -; RV64IDZFH-NEXT: ret -; -; RV32I-LABEL: lrint_f16: -; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: call lrintf@plt -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 -; RV32I-NEXT: ret -; -; RV64I-LABEL: lrint_f16: -; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: call lrintf@plt -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 -; RV64I-NEXT: ret - %1 = call iXLen @llvm.lrint.iXLen.f16(float %a) - ret iXLen %1 -} - -declare iXLen @llvm.lround.iXLen.f16(float) - -define iXLen @lround_f16(float %a) nounwind { -; RV32IZFH-LABEL: lround_f16: -; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: fcvt.w.s a0, fa0, rmm -; RV32IZFH-NEXT: ret -; -; RV64IZFH-LABEL: lround_f16: -; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.l.s a0, fa0, rmm -; RV64IZFH-NEXT: ret -; -; RV32IDZFH-LABEL: lround_f16: -; RV32IDZFH: # %bb.0: -; RV32IDZFH-NEXT: fcvt.w.s a0, fa0, rmm -; RV32IDZFH-NEXT: ret -; -; RV64IDZFH-LABEL: lround_f16: -; RV64IDZFH: # %bb.0: -; RV64IDZFH-NEXT: fcvt.l.s a0, fa0, rmm -; RV64IDZFH-NEXT: ret -; -; RV32I-LABEL: lround_f16: -; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: call lroundf@plt -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 -; RV32I-NEXT: ret -; -; RV64I-LABEL: lround_f16: -; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: call lroundf@plt -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 -; RV64I-NEXT: ret - %1 = call iXLen @llvm.lround.iXLen.f16(float %a) - ret iXLen %1 -} - -declare i64 @llvm.llrint.i64.f16(float) - -define i64 @llrint_f16(float %a) nounwind { -; RV32IZFH-LABEL: llrint_f16: -; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: addi sp, sp, -16 -; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IZFH-NEXT: call llrintf@plt -; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32IZFH-NEXT: addi sp, sp, 16 -; 
RV32IZFH-NEXT: ret -; -; RV64IZFH-LABEL: llrint_f16: -; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.l.s a0, fa0 -; RV64IZFH-NEXT: ret -; -; RV32IDZFH-LABEL: llrint_f16: -; RV32IDZFH: # %bb.0: -; RV32IDZFH-NEXT: addi sp, sp, -16 -; RV32IDZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IDZFH-NEXT: call llrintf@plt -; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32IDZFH-NEXT: addi sp, sp, 16 -; RV32IDZFH-NEXT: ret -; -; RV64IDZFH-LABEL: llrint_f16: -; RV64IDZFH: # %bb.0: -; RV64IDZFH-NEXT: fcvt.l.s a0, fa0 -; RV64IDZFH-NEXT: ret -; -; RV32I-LABEL: llrint_f16: -; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: call llrintf@plt -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 -; RV32I-NEXT: ret -; -; RV64I-LABEL: llrint_f16: -; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: call llrintf@plt -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 -; RV64I-NEXT: ret - %1 = call i64 @llvm.llrint.i64.f16(float %a) - ret i64 %1 -} - -declare i64 @llvm.llround.i64.f16(float) - -define i64 @llround_f16(float %a) nounwind { -; RV32IZFH-LABEL: llround_f16: -; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: addi sp, sp, -16 -; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IZFH-NEXT: call llroundf@plt -; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32IZFH-NEXT: addi sp, sp, 16 -; RV32IZFH-NEXT: ret -; -; RV64IZFH-LABEL: llround_f16: -; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.l.s a0, fa0, rmm -; RV64IZFH-NEXT: ret -; -; RV32IDZFH-LABEL: llround_f16: -; RV32IDZFH: # %bb.0: -; RV32IDZFH-NEXT: addi sp, sp, -16 -; RV32IDZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IDZFH-NEXT: call llroundf@plt -; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32IDZFH-NEXT: addi sp, sp, 16 -; RV32IDZFH-NEXT: ret -; -; RV64IDZFH-LABEL: llround_f16: -; RV64IDZFH: # %bb.0: -; RV64IDZFH-NEXT: fcvt.l.s a0, fa0, rmm -; RV64IDZFH-NEXT: ret -; -; RV32I-LABEL: llround_f16: -; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: call llroundf@plt -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 -; RV32I-NEXT: ret -; -; RV64I-LABEL: llround_f16: -; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: call llroundf@plt -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 -; RV64I-NEXT: ret - %1 = call i64 @llvm.llround.i64.f16(float %a) - ret i64 %1 -} diff --git a/llvm/test/CodeGen/RISCV/rv64f-half-convert-strict.ll b/llvm/test/CodeGen/RISCV/rv64zfh-half-convert-strict.ll rename from llvm/test/CodeGen/RISCV/rv64f-half-convert-strict.ll rename to llvm/test/CodeGen/RISCV/rv64zfh-half-convert-strict.ll diff --git a/llvm/test/CodeGen/RISCV/rv64f-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64zfh-half-convert.ll rename from llvm/test/CodeGen/RISCV/rv64f-half-convert.ll rename to llvm/test/CodeGen/RISCV/rv64zfh-half-convert.ll diff --git a/llvm/test/CodeGen/RISCV/rv64zfh-half-intrinsics.ll b/llvm/test/CodeGen/RISCV/rv64zfh-half-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64zfh-half-intrinsics.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv64 -mattr=+experimental-zfh \ +; RUN: -verify-machineinstrs -target-abi 
lp64f | \ +; RUN: FileCheck -check-prefix=RV64IZFH %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+d \ +; RUN: -mattr=+experimental-zfh -verify-machineinstrs -target-abi lp64d | \ +; RUN: FileCheck -check-prefix=RV64IDZFH %s + +; These intrinsics require half and i64 to be legal types. + +declare i64 @llvm.llrint.i64.f16(half) + +define i64 @llrint_f16(half %a) nounwind { +; RV64IZFH-LABEL: llrint_f16: +; RV64IZFH: # %bb.0: +; RV64IZFH-NEXT: fcvt.l.h a0, fa0 +; RV64IZFH-NEXT: ret +; +; RV64IDZFH-LABEL: llrint_f16: +; RV64IDZFH: # %bb.0: +; RV64IDZFH-NEXT: fcvt.l.h a0, fa0 +; RV64IDZFH-NEXT: ret + %1 = call i64 @llvm.llrint.i64.f16(half %a) + ret i64 %1 +} + +declare i64 @llvm.llround.i64.f16(half) + +define i64 @llround_f16(half %a) nounwind { +; RV64IZFH-LABEL: llround_f16: +; RV64IZFH: # %bb.0: +; RV64IZFH-NEXT: fcvt.l.h a0, fa0, rmm +; RV64IZFH-NEXT: ret +; +; RV64IDZFH-LABEL: llround_f16: +; RV64IDZFH: # %bb.0: +; RV64IDZFH-NEXT: fcvt.l.h a0, fa0, rmm +; RV64IDZFH-NEXT: ret + %1 = call i64 @llvm.llround.i64.f16(half %a) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v < %s 2>&1 | FileCheck %s + +define @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, %w, %x, %y, %z) { +; CHECK-LABEL: bar: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a0, 0(sp) +; CHECK-NEXT: ld a1, 8(sp) +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: vl8re32.v v0, (a1) +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vadd.vv v16, v16, v0 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %s0 = add %w, %y + %s1 = add %x, %z + %s = add %s0, %s1 + ret %s +} + +define @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, %x) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset ra, -8 +; CHECK-NEXT: csrr t0, vlenb +; CHECK-NEXT: slli t0, t0, 4 +; CHECK-NEXT: sub sp, sp, t0 +; CHECK-NEXT: addi t0, sp, 40 +; CHECK-NEXT: sd t0, 8(sp) +; CHECK-NEXT: csrr t0, vlenb +; CHECK-NEXT: slli t0, t0, 3 +; CHECK-NEXT: add t0, sp, t0 +; CHECK-NEXT: addi t0, t0, 40 +; CHECK-NEXT: sd t0, 0(sp) +; CHECK-NEXT: addi t0, sp, 40 +; CHECK-NEXT: vs8r.v v8, (t0) +; CHECK-NEXT: csrr t0, vlenb +; CHECK-NEXT: slli t0, t0, 3 +; CHECK-NEXT: add t0, sp, t0 +; CHECK-NEXT: addi t0, t0, 40 +; CHECK-NEXT: vs8r.v v8, (t0) +; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: call bar@plt +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 48 +; CHECK-NEXT: ret + %ret = call @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, %x, %x, %x, %x) + ret %ret +} diff --git a/llvm/test/CodeGen/RISCV/rvv/unsupported-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/unsupported-calling-conv.ll deleted file mode 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unsupported-calling-conv.ll +++ /dev/null @@ -1,12 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: not --crash llc -mtriple=riscv64 -mattr=+experimental-v < %s 2>&1 | FileCheck %s - -; A rather pathological test case in which we exhaust 
all vector registers and -; all scalar registers, forcing %z to go through the stack. This is not yet -; supported, so check that a reasonable error message is produced rather than -; hitting an assertion or producing incorrect code. -; CHECK: LLVM ERROR: Unable to pass scalable vector types on the stack -define @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, %x, %y, %z) { - %s = add %x, %z - ret %s -} diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir b/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir --- a/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir @@ -313,14 +313,14 @@ ; CHECK-NEXT: $v8_v9_v10_v11_v12_v13_v14_v15 = PseudoVLSEG8E32_V_M1 killed $x12, $noreg, 5, implicit $vl, implicit $vtype ; CHECK-NEXT: $x0 = PseudoVSETIVLI 10, 80, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v15 = PseudoVLE32_V_M1 killed $x16, $noreg, 5, implicit $vl, implicit $vtype, implicit killed $v8_v9_v10_v11_v12_v13_v14_v15, implicit-def $v8_v9_v10_v11_v12_v13_v14_v15 - ; CHECK-NEXT: $v24 = PseudoVMV_V_V_M1 killed $v8, $noreg, 5, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v25 = PseudoVMV_V_V_M1 killed $v9, $noreg, 5, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v26 = PseudoVMV_V_V_M1 killed $v10, $noreg, 5, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v27 = PseudoVMV_V_V_M1 killed $v11, $noreg, 5, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v28 = PseudoVMV_V_V_M1 killed $v12, $noreg, 5, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v29 = PseudoVMV_V_V_M1 killed $v13, $noreg, 5, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v30 = PseudoVMV_V_V_M1 killed $v14, $noreg, 5, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v31 = PseudoVMV_V_V_M1 killed $v15, $noreg, 5, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v24 = PseudoVMV1R_V killed $v8 + ; CHECK-NEXT: $v25 = PseudoVMV1R_V killed $v9 + ; CHECK-NEXT: $v26 = PseudoVMV1R_V killed $v10 + ; CHECK-NEXT: $v27 = PseudoVMV1R_V killed $v11 + ; CHECK-NEXT: $v28 = PseudoVMV1R_V killed $v12 + ; CHECK-NEXT: $v29 = PseudoVMV1R_V killed $v13 + ; CHECK-NEXT: $v30 = PseudoVMV1R_V killed $v14 + ; CHECK-NEXT: $v31 = PseudoVMV1R_V killed $v15 $x0 = PseudoVSETVLI $x14, 80, implicit-def $vl, implicit-def $vtype $v8_v9_v10_v11_v12_v13_v14_v15 = PseudoVLSEG8E32_V_M1 killed $x12, $noreg, 5, implicit $vl, implicit $vtype $x0 = PseudoVSETIVLI 10, 80, implicit-def $vl, implicit-def $vtype diff --git a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll --- a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll @@ -259,7 +259,7 @@ ; RV64IZbb-LABEL: func16: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: sext.h a0, a0 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: sext.h a1, a1 ; RV64IZbb-NEXT: add a0, a0, a1 ; RV64IZbb-NEXT: lui a1, 8 @@ -335,7 +335,7 @@ ; RV64IZbb-LABEL: func8: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: sext.b a0, a0 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: sext.b a1, a1 ; RV64IZbb-NEXT: add a0, a0, a1 ; RV64IZbb-NEXT: li a1, 127 diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -244,9 +244,9 @@ ; RV64-NEXT: srai a1, a1, 58 ; RV64-NEXT: srli a1, a1, 9 ; RV64-NEXT: andi a1, a1, 3 -; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: addw a1, a0, a1 ; RV64-NEXT: andi a1, a1, 60 -; RV64-NEXT: 
sub a0, a0, a1 +; RV64-NEXT: subw a0, a0, a1 ; RV64-NEXT: andi a0, a0, 63 ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: ret @@ -270,9 +270,9 @@ ; RV64M-NEXT: srai a1, a1, 58 ; RV64M-NEXT: srli a1, a1, 9 ; RV64M-NEXT: andi a1, a1, 3 -; RV64M-NEXT: add a1, a0, a1 +; RV64M-NEXT: addw a1, a0, a1 ; RV64M-NEXT: andi a1, a1, 60 -; RV64M-NEXT: sub a0, a0, a1 +; RV64M-NEXT: subw a0, a0, a1 ; RV64M-NEXT: andi a0, a0, 63 ; RV64M-NEXT: snez a0, a0 ; RV64M-NEXT: ret @@ -296,9 +296,9 @@ ; RV64MV-NEXT: srai a1, a1, 58 ; RV64MV-NEXT: srli a1, a1, 9 ; RV64MV-NEXT: andi a1, a1, 3 -; RV64MV-NEXT: add a1, a0, a1 +; RV64MV-NEXT: addw a1, a0, a1 ; RV64MV-NEXT: andi a1, a1, 60 -; RV64MV-NEXT: sub a0, a0, a1 +; RV64MV-NEXT: subw a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 63 ; RV64MV-NEXT: snez a0, a0 ; RV64MV-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll --- a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll @@ -258,7 +258,7 @@ ; RV64IZbb-LABEL: func16: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: sext.h a0, a0 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: sext.h a1, a1 ; RV64IZbb-NEXT: sub a0, a0, a1 ; RV64IZbb-NEXT: lui a1, 8 @@ -334,7 +334,7 @@ ; RV64IZbb-LABEL: func8: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: sext.b a0, a0 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: sext.b a1, a1 ; RV64IZbb-NEXT: sub a0, a0, a1 ; RV64IZbb-NEXT: li a1, 127 diff --git a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll --- a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll @@ -160,7 +160,7 @@ ; RV64IZbb-LABEL: func16: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: zext.h a0, a0 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: zext.h a1, a1 ; RV64IZbb-NEXT: add a0, a0, a1 ; RV64IZbb-NEXT: lui a1, 16 @@ -189,7 +189,7 @@ ; RV64I-LABEL: func8: ; RV64I: # %bb.0: ; RV64I-NEXT: andi a0, a0, 255 -; RV64I-NEXT: mul a1, a1, a2 +; RV64I-NEXT: mulw a1, a1, a2 ; RV64I-NEXT: andi a1, a1, 255 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: li a1, 255 @@ -212,7 +212,7 @@ ; RV64IZbb-LABEL: func8: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: andi a0, a0, 255 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: andi a1, a1, 255 ; RV64IZbb-NEXT: add a0, a0, a1 ; RV64IZbb-NEXT: li a1, 255 @@ -240,7 +240,7 @@ ; RV64I-LABEL: func4: ; RV64I: # %bb.0: ; RV64I-NEXT: andi a0, a0, 15 -; RV64I-NEXT: mul a1, a1, a2 +; RV64I-NEXT: mulw a1, a1, a2 ; RV64I-NEXT: andi a1, a1, 15 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: li a1, 15 @@ -263,7 +263,7 @@ ; RV64IZbb-LABEL: func4: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: andi a0, a0, 15 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: andi a1, a1, 15 ; RV64IZbb-NEXT: add a0, a0, a1 ; RV64IZbb-NEXT: li a1, 15 diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -219,9 +219,9 @@ ; ; RV64-LABEL: test_urem_odd_setne: ; RV64: # %bb.0: -; RV64-NEXT: slli a1, a0, 1 -; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: neg a0, a0 +; RV64-NEXT: slliw a1, a0, 1 +; RV64-NEXT: addw a0, a1, a0 +; RV64-NEXT: negw a0, a0 ; RV64-NEXT: andi a0, a0, 15 ; RV64-NEXT: li a1, 3 ; RV64-NEXT: sltu a0, a1, a0 @@ -239,9 +239,9 @@ ; ; RV64M-LABEL: test_urem_odd_setne: ; 
RV64M: # %bb.0: -; RV64M-NEXT: slli a1, a0, 1 -; RV64M-NEXT: add a0, a1, a0 -; RV64M-NEXT: neg a0, a0 +; RV64M-NEXT: slliw a1, a0, 1 +; RV64M-NEXT: addw a0, a1, a0 +; RV64M-NEXT: negw a0, a0 ; RV64M-NEXT: andi a0, a0, 15 ; RV64M-NEXT: li a1, 3 ; RV64M-NEXT: sltu a0, a1, a0 @@ -259,9 +259,9 @@ ; ; RV64MV-LABEL: test_urem_odd_setne: ; RV64MV: # %bb.0: -; RV64MV-NEXT: slli a1, a0, 1 -; RV64MV-NEXT: add a0, a1, a0 -; RV64MV-NEXT: neg a0, a0 +; RV64MV-NEXT: slliw a1, a0, 1 +; RV64MV-NEXT: addw a0, a1, a0 +; RV64MV-NEXT: negw a0, a0 ; RV64MV-NEXT: andi a0, a0, 15 ; RV64MV-NEXT: li a1, 3 ; RV64MV-NEXT: sltu a0, a1, a0 @@ -310,7 +310,7 @@ ; RV64M-LABEL: test_urem_negative_odd: ; RV64M: # %bb.0: ; RV64M-NEXT: li a1, 307 -; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: mulw a0, a0, a1 ; RV64M-NEXT: andi a0, a0, 511 ; RV64M-NEXT: li a1, 1 ; RV64M-NEXT: sltu a0, a1, a0 @@ -328,7 +328,7 @@ ; RV64MV-LABEL: test_urem_negative_odd: ; RV64MV: # %bb.0: ; RV64MV-NEXT: li a1, 307 -; RV64MV-NEXT: mul a0, a0, a1 +; RV64MV-NEXT: mulw a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 511 ; RV64MV-NEXT: li a1, 1 ; RV64MV-NEXT: sltu a0, a1, a0 @@ -427,19 +427,19 @@ ; RV64-NEXT: li a1, 819 ; RV64-NEXT: mv a0, s1 ; RV64-NEXT: call __muldi3@plt -; RV64-NEXT: addi a0, a0, -1638 +; RV64-NEXT: addiw a0, a0, -1638 ; RV64-NEXT: andi a0, a0, 2047 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: sltu s1, a1, a0 ; RV64-NEXT: li a1, 1463 ; RV64-NEXT: mv a0, s2 ; RV64-NEXT: call __muldi3@plt -; RV64-NEXT: addi a0, a0, -1463 +; RV64-NEXT: addiw a0, a0, -1463 ; RV64-NEXT: andi a0, a0, 2047 ; RV64-NEXT: li a1, 292 ; RV64-NEXT: sltu a0, a1, a0 -; RV64-NEXT: neg a1, s3 -; RV64-NEXT: neg a0, a0 +; RV64-NEXT: negw a1, s3 +; RV64-NEXT: negw a0, a0 ; RV64-NEXT: andi a1, a1, 2047 ; RV64-NEXT: andi a0, a0, 2047 ; RV64-NEXT: slli a0, a0, 11 @@ -524,19 +524,19 @@ ; RV64M-NEXT: li a4, 341 ; RV64M-NEXT: sltu a1, a4, a1 ; RV64M-NEXT: li a4, 819 -; RV64M-NEXT: mul a3, a3, a4 -; RV64M-NEXT: addi a3, a3, -1638 +; RV64M-NEXT: mulw a3, a3, a4 +; RV64M-NEXT: addiw a3, a3, -1638 ; RV64M-NEXT: andi a3, a3, 2047 ; RV64M-NEXT: li a4, 1 ; RV64M-NEXT: sltu a3, a4, a3 ; RV64M-NEXT: li a4, 1463 -; RV64M-NEXT: mul a2, a2, a4 -; RV64M-NEXT: addi a2, a2, -1463 +; RV64M-NEXT: mulw a2, a2, a4 +; RV64M-NEXT: addiw a2, a2, -1463 ; RV64M-NEXT: andi a2, a2, 2047 ; RV64M-NEXT: li a4, 292 ; RV64M-NEXT: sltu a2, a4, a2 -; RV64M-NEXT: neg a1, a1 -; RV64M-NEXT: neg a2, a2 +; RV64M-NEXT: negw a1, a1 +; RV64M-NEXT: negw a2, a2 ; RV64M-NEXT: andi a1, a1, 2047 ; RV64M-NEXT: andi a2, a2, 2047 ; RV64M-NEXT: slli a2, a2, 11 diff --git a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll --- a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll @@ -163,7 +163,7 @@ ; RV64IZbb-LABEL: func16: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: zext.h a0, a0 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: zext.h a1, a1 ; RV64IZbb-NEXT: maxu a0, a0, a1 ; RV64IZbb-NEXT: sub a0, a0, a1 @@ -190,7 +190,7 @@ ; RV64I-LABEL: func8: ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a0, 255 -; RV64I-NEXT: mul a0, a1, a2 +; RV64I-NEXT: mulw a0, a1, a2 ; RV64I-NEXT: andi a0, a0, 255 ; RV64I-NEXT: sub a1, a3, a0 ; RV64I-NEXT: li a0, 0 @@ -212,7 +212,7 @@ ; RV64IZbb-LABEL: func8: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: andi a0, a0, 255 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: andi a1, a1, 255 ; RV64IZbb-NEXT: maxu a0, a0, a1 ; RV64IZbb-NEXT: sub a0, a0, a1 @@ -239,7 +239,7 @@ ; RV64I-LABEL: func4: ; 
RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a0, 15 -; RV64I-NEXT: mul a0, a1, a2 +; RV64I-NEXT: mulw a0, a1, a2 ; RV64I-NEXT: andi a0, a0, 15 ; RV64I-NEXT: sub a1, a3, a0 ; RV64I-NEXT: li a0, 0 @@ -261,7 +261,7 @@ ; RV64IZbb-LABEL: func4: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: andi a0, a0, 15 -; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: mulw a1, a1, a2 ; RV64IZbb-NEXT: andi a1, a1, 15 ; RV64IZbb-NEXT: maxu a0, a0, a1 ; RV64IZbb-NEXT: sub a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/zfh-half-intrinsics.ll b/llvm/test/CodeGen/RISCV/zfh-half-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/zfh-half-intrinsics.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+experimental-zfh \ +; RUN: -verify-machineinstrs -target-abi ilp32f | \ +; RUN: FileCheck -check-prefix=RV32IZFH %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+experimental-zfh \ +; RUN: -verify-machineinstrs -target-abi lp64f | \ +; RUN: FileCheck -check-prefix=RV64IZFH %s +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+d \ +; RUN: -mattr=+experimental-zfh -verify-machineinstrs -target-abi ilp32d | \ +; RUN: FileCheck -check-prefix=RV32IDZFH %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+d \ +; RUN: -mattr=+experimental-zfh -verify-machineinstrs -target-abi lp64d | \ +; RUN: FileCheck -check-prefix=RV64IDZFH %s + +; These intrinsics require half to be a legal type. + +declare iXLen @llvm.lrint.iXLen.f16(half) + +define iXLen @lrint_f16(half %a) nounwind { +; RV32IZFH-LABEL: lrint_f16: +; RV32IZFH: # %bb.0: +; RV32IZFH-NEXT: fcvt.w.h a0, fa0 +; RV32IZFH-NEXT: ret +; +; RV64IZFH-LABEL: lrint_f16: +; RV64IZFH: # %bb.0: +; RV64IZFH-NEXT: fcvt.l.h a0, fa0 +; RV64IZFH-NEXT: ret +; +; RV32IDZFH-LABEL: lrint_f16: +; RV32IDZFH: # %bb.0: +; RV32IDZFH-NEXT: fcvt.w.h a0, fa0 +; RV32IDZFH-NEXT: ret +; +; RV64IDZFH-LABEL: lrint_f16: +; RV64IDZFH: # %bb.0: +; RV64IDZFH-NEXT: fcvt.l.h a0, fa0 +; RV64IDZFH-NEXT: ret + %1 = call iXLen @llvm.lrint.iXLen.f16(half %a) + ret iXLen %1 +} + +declare iXLen @llvm.lround.iXLen.f16(half) + +define iXLen @lround_f16(half %a) nounwind { +; RV32IZFH-LABEL: lround_f16: +; RV32IZFH: # %bb.0: +; RV32IZFH-NEXT: fcvt.w.h a0, fa0, rmm +; RV32IZFH-NEXT: ret +; +; RV64IZFH-LABEL: lround_f16: +; RV64IZFH: # %bb.0: +; RV64IZFH-NEXT: fcvt.l.h a0, fa0, rmm +; RV64IZFH-NEXT: ret +; +; RV32IDZFH-LABEL: lround_f16: +; RV32IDZFH: # %bb.0: +; RV32IDZFH-NEXT: fcvt.w.h a0, fa0, rmm +; RV32IDZFH-NEXT: ret +; +; RV64IDZFH-LABEL: lround_f16: +; RV64IDZFH: # %bb.0: +; RV64IDZFH-NEXT: fcvt.l.h a0, fa0, rmm +; RV64IDZFH-NEXT: ret + %1 = call iXLen @llvm.lround.iXLen.f16(half %a) + ret iXLen %1 +} diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -15,19 +15,15 @@ ; CHECK-NEXT: adds r6, r0, #1 ; CHECK-NEXT: adc r4, r4, #0 ; CHECK-NEXT: subs.w r0, lr, #-1 -; CHECK-NEXT: sbcs r0, r12, #0 ; CHECK-NEXT: vmov q1[2], q1[0], lr, r6 -; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: sbcs r0, r12, #0 ; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: csetm r12, ne +; CHECK-NEXT: csetm r12, lo ; CHECK-NEXT: subs.w r6, r6, #-1 -; CHECK-NEXT: sbcs r6, r4, #0 ; CHECK-NEXT: bfi r5, r12, #0, #8 -; CHECK-NEXT: cset r6, lo -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: 
csetm r6, ne +; CHECK-NEXT: sbcs r6, r4, #0 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm r6, lo ; CHECK-NEXT: bfi r5, r6, #8, #8 ; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: vpsel q1, q1, q0 @@ -38,17 +34,13 @@ ; CHECK-NEXT: subs r1, r6, r1 ; CHECK-NEXT: sbcs.w r1, r5, r4 ; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: cset r1, lo +; CHECK-NEXT: csetm r1, lo ; CHECK-NEXT: vldr d1, [sp, #16] -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: bfi r0, r1, #0, #8 ; CHECK-NEXT: vmov r1, r6, d3 ; CHECK-NEXT: subs r1, r1, r5 ; CHECK-NEXT: sbcs.w r1, r6, r4 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lo ; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: add r0, sp, #24 diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll --- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll @@ -23,37 +23,29 @@ ; CHECK-NEXT: subs.w r3, r4, r12 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: subs.w r0, r0, r12 ; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r5, r3, #0, #8 -; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: adr r4, .LCPI0_1 ; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: adr r4, .LCPI0_1 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 ; CHECK-NEXT: sbcs.w r0, r12, r1 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r2, r0, #0, #8 ; CHECK-NEXT: rsbs.w r0, r3, #-2147483648 ; CHECK-NEXT: sbcs.w r0, r12, r5 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r2, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -99,18 +91,14 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: cset r3, lo -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lo ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: bfi r2, r3, #0, #8 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vpop {d8, d9} @@ -141,35 +129,27 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: bfi r5, r3, #0, #8 -; CHECK-NEXT: cset r0, 
lt -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r2, r1 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: rsbs r1, r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r5 ; CHECK-NEXT: bfi r2, r0, #0, #8 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r2, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -220,21 +200,17 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 ; CHECK-NEXT: sbcs r2, r6, #0 ; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: vmov.i64 q5, #0xffffffff -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: vmov.i64 q5, #0xffffffff ; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmov r0, r4, d8 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 ; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: mov r5, r0 @@ -245,15 +221,11 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 ; CHECK-NEXT: sbcs r2, r6, #0 ; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: bfi r7, r2, #0, #8 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r7, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r7 ; CHECK-NEXT: vpsel q0, q0, q5 @@ -368,20 +340,16 @@ ; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: sbcs.w r2, r6, r5 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r0, r6, r1 ; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __fixhfdi @@ -393,15 +361,11 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 ; CHECK-NEXT: sbcs.w r2, r6, r5 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r6, r1 ; CHECK-NEXT: bfi r6, r2, #0, #8 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r6, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: vpsel q0, q0, q5 @@ -450,16 +414,12 @@ ; CHECK-NEXT: subs 
r1, r1, r4 ; CHECK-NEXT: sbcs r1, r2, #0 ; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r2, r1, #0, #8 ; CHECK-NEXT: subs r1, r3, r4 ; CHECK-NEXT: sbcs r1, r5, #0 ; CHECK-NEXT: adr r4, .LCPI9_1 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r2, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -468,15 +428,11 @@ ; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: subs.w r1, lr, r1 ; CHECK-NEXT: sbcs.w r1, r12, r2 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #0, #8 ; CHECK-NEXT: subs.w r1, lr, r3 ; CHECK-NEXT: sbcs.w r1, r12, r5 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -527,15 +483,11 @@ ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: subs r0, r0, r4 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r5, r0, #0, #8 ; CHECK-NEXT: subs r0, r2, r4 ; CHECK-NEXT: sbcs r0, r3, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r5, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -573,15 +525,11 @@ ; CHECK-NEXT: subs r1, r1, r4 ; CHECK-NEXT: sbcs r1, r2, #0 ; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r2, r1, #0, #8 ; CHECK-NEXT: subs r1, r3, r4 ; CHECK-NEXT: sbcs r1, r5, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r2, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -590,15 +538,11 @@ ; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: sbcs.w r1, r0, r2 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: rsbs r2, r3, #0 ; CHECK-NEXT: sbcs.w r2, r0, r5 ; CHECK-NEXT: bfi r0, r1, #0, #8 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1299,37 +1243,29 @@ ; CHECK-NEXT: subs.w r3, r4, r12 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: subs.w r0, r0, r12 ; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r5, r3, #0, #8 -; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: adr r4, .LCPI27_1 ; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: adr r4, .LCPI27_1 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 ; CHECK-NEXT: sbcs.w r0, r12, r1 -; 
CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r2, r0, #0, #8 ; CHECK-NEXT: rsbs.w r0, r3, #-2147483648 ; CHECK-NEXT: sbcs.w r0, r12, r5 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r2, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1373,18 +1309,14 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: cset r3, lo -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lo ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: bfi r2, r3, #0, #8 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vpop {d8, d9} @@ -1414,35 +1346,27 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: vmov q1[3], q1[1], r5, r1 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: bfi r5, r3, #0, #8 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r2, r1 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: rsbs r1, r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r5 ; CHECK-NEXT: bfi r2, r0, #0, #8 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r2, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1489,21 +1413,17 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 ; CHECK-NEXT: sbcs r2, r6, #0 ; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: vmov.i64 q5, #0xffffffff -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: vmov.i64 q5, #0xffffffff ; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmov r0, r4, d8 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 ; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __aeabi_f2ulz ; CHECK-NEXT: mov r5, r0 @@ -1514,15 +1434,11 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 ; CHECK-NEXT: sbcs r2, r6, #0 ; CHECK-NEXT: vmov q0[3], q0[1], r6, r1 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 ; 
CHECK-NEXT: bfi r7, r2, #0, #8 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r7, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r7 ; CHECK-NEXT: vpsel q0, q0, q5 @@ -1631,20 +1547,16 @@ ; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: sbcs.w r2, r6, r5 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r0, r6, r1 ; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q6, q0, q5 ; CHECK-NEXT: bl __fixhfdi @@ -1656,15 +1568,11 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 ; CHECK-NEXT: sbcs.w r2, r6, r5 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r6, r1 ; CHECK-NEXT: bfi r6, r2, #0, #8 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r6, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: vpsel q0, q0, q5 @@ -1711,16 +1619,12 @@ ; CHECK-NEXT: subs r1, r1, r4 ; CHECK-NEXT: sbcs r1, r2, #0 ; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r2, r1, #0, #8 ; CHECK-NEXT: subs r1, r3, r4 ; CHECK-NEXT: sbcs r1, r5, #0 ; CHECK-NEXT: adr r4, .LCPI36_1 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r2, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1729,15 +1633,11 @@ ; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: subs.w r1, lr, r1 ; CHECK-NEXT: sbcs.w r1, r12, r2 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #0, #8 ; CHECK-NEXT: subs.w r1, lr, r3 ; CHECK-NEXT: sbcs.w r1, r12, r5 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1786,15 +1686,11 @@ ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: subs r0, r0, r4 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r5, r0, #0, #8 ; CHECK-NEXT: subs r0, r2, r4 ; CHECK-NEXT: sbcs r0, r3, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r5, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1831,15 +1727,11 @@ ; CHECK-NEXT: subs r1, r1, r4 ; CHECK-NEXT: sbcs r1, r2, #0 ; CHECK-NEXT: mov.w r2, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r2, r1, #0, #8 ; CHECK-NEXT: subs r1, r3, r4 ; CHECK-NEXT: sbcs r1, r5, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: 
csetm r1, lt ; CHECK-NEXT: bfi r2, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1848,15 +1740,11 @@ ; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: sbcs.w r1, r0, r2 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: rsbs r2, r3, #0 ; CHECK-NEXT: sbcs.w r2, r0, r5 ; CHECK-NEXT: bfi r0, r1, #0, #8 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpsel q0, q0, q1 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -95,55 +95,47 @@ ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: ldrd r12, lr, [r1] ; CHECK-LE-NEXT: movs r3, #0 -; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: @ implicit-def: $q1 ; CHECK-LE-NEXT: rsbs.w r1, r12, #0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr +; CHECK-LE-NEXT: vmov q0[2], q0[0], r12, lr ; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: csetm r1, lt ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 ; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: csetm r1, lt ; CHECK-LE-NEXT: bfi r3, r1, #1, #1 ; CHECK-LE-NEXT: lsls r1, r3, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r1, [r2] -; CHECK-LE-NEXT: vmovne.32 q0[0], r1 +; CHECK-LE-NEXT: vmovne.32 q1[0], r1 ; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] -; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r2, s2 +; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 +; CHECK-LE-NEXT: vmov r2, s6 ; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r3, s4 -; CHECK-LE-NEXT: vmov r4, s0 -; CHECK-LE-NEXT: vmov q0[2], q0[0], r4, r2 +; CHECK-LE-NEXT: vmov r3, s0 +; CHECK-LE-NEXT: vmov r4, s4 +; CHECK-LE-NEXT: vmov q1[2], q1[0], r4, r2 ; CHECK-LE-NEXT: rsbs r5, r3, #0 ; CHECK-LE-NEXT: asr.w r12, r2, #31 ; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31 -; CHECK-LE-NEXT: vmov r3, s6 -; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: vmov r3, s2 +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: asr.w lr, r4, #31 -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: vmov q1[3], q1[1], lr, r12 ; CHECK-LE-NEXT: rsbs r5, r3, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 ; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: vstrne d0, [r0] +; CHECK-LE-NEXT: vstrne d2, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi -; CHECK-LE-NEXT: vstrmi d1, [r0, #8] +; CHECK-LE-NEXT: vstrmi d3, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r4, r5, r7, pc} ; @@ -157,17 +149,13 @@ ; CHECK-BE-NEXT: rsbs.w r3, lr, #0 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 -; CHECK-BE-NEXT: cset r3, lt ; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr -; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: @ implicit-def: $q2 -; CHECK-BE-NEXT: csetm lr, ne +; CHECK-BE-NEXT: csetm 
lr, lt ; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: @ implicit-def: $q2 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 ; CHECK-BE-NEXT: bfi r1, lr, #0, #1 -; CHECK-BE-NEXT: cset r3, lt -; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: csetm r3, ne +; CHECK-BE-NEXT: csetm r3, lt ; CHECK-BE-NEXT: bfi r1, r3, #1, #1 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB5_2 @@ -198,16 +186,12 @@ ; CHECK-BE-NEXT: sbcs.w r4, r1, r4, asr #31 ; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-BE-NEXT: vmov r3, s9 -; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne ; CHECK-BE-NEXT: rsbs r5, r3, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r3, asr #31 ; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi @@ -239,15 +223,11 @@ ; CHECK-LE-NEXT: rsbs.w r1, r12, #0 ; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr ; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: csetm r1, lt ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 ; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: csetm r1, lt ; CHECK-LE-NEXT: bfi r3, r1, #1, #1 ; CHECK-LE-NEXT: lsls r1, r3, #31 ; CHECK-LE-NEXT: itt ne @@ -266,17 +246,13 @@ ; CHECK-LE-NEXT: asr.w r12, r2, #31 ; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31 ; CHECK-LE-NEXT: vmov r3, s6 -; CHECK-LE-NEXT: cset r2, lt +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: asr.w lr, r4, #31 -; CHECK-LE-NEXT: cmp r2, #0 ; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 -; CHECK-LE-NEXT: csetm r2, ne ; CHECK-LE-NEXT: rsbs r5, r3, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 ; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne @@ -299,17 +275,13 @@ ; CHECK-BE-NEXT: rsbs.w r3, lr, #0 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 -; CHECK-BE-NEXT: cset r3, lt ; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr -; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: @ implicit-def: $q2 -; CHECK-BE-NEXT: csetm lr, ne +; CHECK-BE-NEXT: csetm lr, lt ; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: @ implicit-def: $q2 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 ; CHECK-BE-NEXT: bfi r1, lr, #0, #1 -; CHECK-BE-NEXT: cset r3, lt -; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: csetm r3, ne +; CHECK-BE-NEXT: csetm r3, lt ; CHECK-BE-NEXT: bfi r1, r3, #1, #1 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB6_2 @@ -340,16 +312,12 @@ ; CHECK-BE-NEXT: sbcs.w r4, r1, r4, asr #31 ; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-BE-NEXT: vmov r3, s9 -; CHECK-BE-NEXT: cset r2, lt +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne ; CHECK-BE-NEXT: rsbs r5, r3, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r3, asr #31 ; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: lsls 
r2, r1, #30 ; CHECK-BE-NEXT: itt mi @@ -384,15 +352,11 @@ ; CHECK-LE-NEXT: rsbs.w r1, r12, #0 ; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr ; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: csetm r1, lt ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 ; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: csetm r1, lt ; CHECK-LE-NEXT: bfi r3, r1, #1, #1 ; CHECK-LE-NEXT: lsls r1, r3, #31 ; CHECK-LE-NEXT: itt ne @@ -408,15 +372,11 @@ ; CHECK-LE-NEXT: rsbs r3, r2, #0 ; CHECK-LE-NEXT: vmov r3, s6 ; CHECK-LE-NEXT: sbcs.w r2, r1, r2, asr #31 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: rsbs r4, r3, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 ; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne @@ -437,51 +397,43 @@ ; CHECK-BE-NEXT: rsbs.w r3, lr, #0 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 -; CHECK-BE-NEXT: cset r3, lt -; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr -; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: csetm lr, ne +; CHECK-BE-NEXT: vmov q1[3], q1[1], r12, lr +; CHECK-BE-NEXT: csetm lr, lt ; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: @ implicit-def: $q0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 ; CHECK-BE-NEXT: bfi r1, lr, #0, #1 -; CHECK-BE-NEXT: cset r3, lt -; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: csetm r3, ne +; CHECK-BE-NEXT: csetm r3, lt ; CHECK-BE-NEXT: bfi r1, r3, #1, #1 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB7_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: ldr r3, [r2] ; CHECK-BE-NEXT: vmov.32 q2[1], r3 -; CHECK-BE-NEXT: vrev64.32 q1, q2 +; CHECK-BE-NEXT: vrev64.32 q0, q2 ; CHECK-BE-NEXT: .LBB7_2: @ %else -; CHECK-BE-NEXT: vrev64.32 q2, q0 +; CHECK-BE-NEXT: vrev64.32 q2, q1 ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB7_4 ; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1 ; CHECK-BE-NEXT: ldr r1, [r2, #4] -; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: vmov.32 q0[3], r1 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmov.32 q1[3], r1 +; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: .LBB7_4: @ %else2 ; CHECK-BE-NEXT: vrev64.32 q3, q2 ; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vmov r2, s15 -; CHECK-BE-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-BE-NEXT: vand q0, q1, q0 +; CHECK-BE-NEXT: vmov.i64 q1, #0xffffffff +; CHECK-BE-NEXT: vand q0, q0, q1 ; CHECK-BE-NEXT: rsbs r3, r2, #0 ; CHECK-BE-NEXT: vmov r3, s13 ; CHECK-BE-NEXT: sbcs.w r2, r1, r2, asr #31 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r12, ne +; CHECK-BE-NEXT: csetm r12, lt ; CHECK-BE-NEXT: rsbs r2, r3, #0 ; CHECK-BE-NEXT: sbcs.w r2, r1, r3, asr #31 ; CHECK-BE-NEXT: bfi r1, r12, #0, #1 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi @@ -514,15 +466,11 @@ ; CHECK-LE-NEXT: rsbs.w r1, r12, #0 ; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr ; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 -; 
CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: csetm r1, lt ; CHECK-LE-NEXT: rsbs.w r4, lr, #0 ; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 ; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: cset r1, lt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: csetm r1, ne +; CHECK-LE-NEXT: csetm r1, lt ; CHECK-LE-NEXT: bfi r3, r1, #1, #1 ; CHECK-LE-NEXT: lsls r1, r3, #31 ; CHECK-LE-NEXT: itt ne @@ -538,15 +486,11 @@ ; CHECK-LE-NEXT: rsbs r3, r2, #0 ; CHECK-LE-NEXT: vmov r3, s6 ; CHECK-LE-NEXT: sbcs.w r2, r1, r2, asr #31 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: rsbs r4, r3, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 ; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne @@ -569,51 +513,43 @@ ; CHECK-BE-NEXT: rsbs.w r3, lr, #0 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31 -; CHECK-BE-NEXT: cset r3, lt -; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr -; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: csetm lr, ne +; CHECK-BE-NEXT: vmov q1[3], q1[1], r12, lr +; CHECK-BE-NEXT: csetm lr, lt ; CHECK-BE-NEXT: rsbs.w r3, r12, #0 +; CHECK-BE-NEXT: @ implicit-def: $q0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31 ; CHECK-BE-NEXT: bfi r1, lr, #0, #1 -; CHECK-BE-NEXT: cset r3, lt -; CHECK-BE-NEXT: cmp r3, #0 -; CHECK-BE-NEXT: csetm r3, ne +; CHECK-BE-NEXT: csetm r3, lt ; CHECK-BE-NEXT: bfi r1, r3, #1, #1 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB8_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: ldr r3, [r2] ; CHECK-BE-NEXT: vmov.32 q2[1], r3 -; CHECK-BE-NEXT: vrev64.32 q1, q2 +; CHECK-BE-NEXT: vrev64.32 q0, q2 ; CHECK-BE-NEXT: .LBB8_2: @ %else -; CHECK-BE-NEXT: vrev64.32 q2, q0 +; CHECK-BE-NEXT: vrev64.32 q2, q1 ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB8_4 ; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1 ; CHECK-BE-NEXT: ldr r1, [r2, #4] -; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: vmov.32 q0[3], r1 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmov.32 q1[3], r1 +; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: .LBB8_4: @ %else2 ; CHECK-BE-NEXT: vrev64.32 q3, q2 ; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vmov r2, s15 -; CHECK-BE-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-BE-NEXT: vand q0, q1, q0 +; CHECK-BE-NEXT: vmov.i64 q1, #0xffffffff +; CHECK-BE-NEXT: vand q0, q0, q1 ; CHECK-BE-NEXT: rsbs r3, r2, #0 ; CHECK-BE-NEXT: vmov r3, s13 ; CHECK-BE-NEXT: sbcs.w r2, r1, r2, asr #31 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r12, ne +; CHECK-BE-NEXT: csetm r12, lt ; CHECK-BE-NEXT: rsbs r2, r3, #0 ; CHECK-BE-NEXT: sbcs.w r2, r1, r3, asr #31 ; CHECK-BE-NEXT: bfi r1, r12, #0, #1 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -1759,15 +1759,11 @@ ; CHECK-LE-NEXT: vmov r12, lr, d1 ; CHECK-LE-NEXT: rsbs r2, r2, #0 ; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; 
CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: rsbs.w r3, r12, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, lr ; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB49_2 @@ -1801,15 +1797,11 @@ ; CHECK-BE-NEXT: vmov r12, lr, d2 ; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: sbcs.w r2, r1, r2 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: rsbs.w r3, lr, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12 ; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB49_2 @@ -1848,15 +1840,11 @@ ; CHECK-LE-NEXT: vmov r12, lr, d3 ; CHECK-LE-NEXT: rsbs r2, r2, #0 ; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: rsbs.w r3, r12, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, lr ; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB50_2 @@ -1890,15 +1878,11 @@ ; CHECK-BE-NEXT: vmov r12, lr, d0 ; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: sbcs.w r2, r1, r2 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: rsbs.w r3, lr, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12 ; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB50_2 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -944,15 +944,11 @@ ; CHECK-LE-NEXT: vmov r12, lr, d1 ; CHECK-LE-NEXT: rsbs r2, r2, #0 ; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: rsbs.w r3, r12, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, lr ; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne @@ -975,15 +971,11 @@ ; CHECK-BE-NEXT: vmov r12, lr, d2 ; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: sbcs.w r2, r1, r2 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: rsbs.w r3, lr, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12 ; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi @@ -1011,15 +1003,11 @@ ; CHECK-LE-NEXT: 
vmov r12, lr, d3 ; CHECK-LE-NEXT: rsbs r2, r2, #0 ; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: rsbs.w r3, r12, #0 ; CHECK-LE-NEXT: sbcs.w r3, r1, lr ; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: cset r2, lt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, lt ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne @@ -1042,15 +1030,11 @@ ; CHECK-BE-NEXT: vmov r12, lr, d4 ; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: sbcs.w r2, r1, r2 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: rsbs.w r3, lr, #0 ; CHECK-BE-NEXT: sbcs.w r3, r1, r12 ; CHECK-BE-NEXT: bfi r1, r2, #0, #1 -; CHECK-BE-NEXT: cset r2, lt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, lt ; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi @@ -1216,33 +1200,25 @@ ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s1, #0 -; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s6, s2 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-LE-NEXT: cset r1, gt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: mov.w r1, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 ; CHECK-LE-NEXT: vcmp.f32 s2, #0 -; CHECK-LE-NEXT: cset r2, gt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: vcmp.f32 s3, #0 -; CHECK-LE-NEXT: cset r2, gt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: cset r2, gt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB25_5 @@ -1282,6 +1258,7 @@ ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.f32 s7, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr @@ -1289,27 +1266,18 @@ ; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 ; CHECK-BE-NEXT: vcvtb.f16.f32 s2, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 -; CHECK-BE-NEXT: cset r1, gt -; CHECK-BE-NEXT: cmp r1, #0 -; CHECK-BE-NEXT: mov.w r1, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 ; CHECK-BE-NEXT: vcmp.f32 s5, #0 -; CHECK-BE-NEXT: cset r2, gt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: vcmp.f32 s4, #0 -; CHECK-BE-NEXT: cset r2, gt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, 
r2, #1, #1 +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-BE-NEXT: bfi r1, r2, #2, #1 -; CHECK-BE-NEXT: cset r2, gt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB25_5 @@ -1356,33 +1324,25 @@ ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s1, #0 -; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s6, s2 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-LE-NEXT: cset r1, gt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: mov.w r1, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 ; CHECK-LE-NEXT: vcmp.f32 s2, #0 -; CHECK-LE-NEXT: cset r2, gt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: vcmp.f32 s3, #0 -; CHECK-LE-NEXT: cset r2, gt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: cset r2, gt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB26_5 @@ -1422,6 +1382,7 @@ ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.f32 s7, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr @@ -1429,27 +1390,18 @@ ; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 ; CHECK-BE-NEXT: vcvtb.f16.f32 s2, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 -; CHECK-BE-NEXT: cset r1, gt -; CHECK-BE-NEXT: cmp r1, #0 -; CHECK-BE-NEXT: mov.w r1, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 ; CHECK-BE-NEXT: vcmp.f32 s5, #0 -; CHECK-BE-NEXT: cset r2, gt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: vcmp.f32 s4, #0 -; CHECK-BE-NEXT: cset r2, gt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-BE-NEXT: bfi r1, r2, #2, #1 -; CHECK-BE-NEXT: cset r2, gt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB26_5 @@ -1496,33 +1448,25 @@ ; CHECK-LE-NEXT: .pad #20 ; CHECK-LE-NEXT: sub sp, #20 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s1, #0 -; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s6, s2 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; 
CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-LE-NEXT: cset r1, gt -; CHECK-LE-NEXT: cmp r1, #0 -; CHECK-LE-NEXT: mov.w r1, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 ; CHECK-LE-NEXT: vcmp.f32 s2, #0 -; CHECK-LE-NEXT: cset r2, gt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: vcmp.f32 s3, #0 -; CHECK-LE-NEXT: cset r2, gt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: cset r2, gt -; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: csetm r2, ne +; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB27_5 @@ -1570,6 +1514,7 @@ ; CHECK-BE-NEXT: .pad #20 ; CHECK-BE-NEXT: sub sp, #20 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.f32 s7, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr @@ -1577,27 +1522,18 @@ ; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 ; CHECK-BE-NEXT: vcvtb.f16.f32 s2, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 -; CHECK-BE-NEXT: cset r1, gt -; CHECK-BE-NEXT: cmp r1, #0 -; CHECK-BE-NEXT: mov.w r1, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: bfi r1, r2, #0, #1 ; CHECK-BE-NEXT: vcmp.f32 s5, #0 -; CHECK-BE-NEXT: cset r2, gt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #0, #1 +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: bfi r1, r2, #1, #1 ; CHECK-BE-NEXT: vcmp.f32 s4, #0 -; CHECK-BE-NEXT: cset r2, gt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: bfi r1, r2, #1, #1 +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-BE-NEXT: bfi r1, r2, #2, #1 -; CHECK-BE-NEXT: cset r2, gt -; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: csetm r2, ne +; CHECK-BE-NEXT: csetm r2, gt ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB27_5 diff --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll @@ -47,15 +47,11 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: subs.w r0, r4, r12 ; CHECK-NEXT: sbcs.w r0, r5, lr -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -111,15 +107,11 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: subs.w r0, r4, r12 ; CHECK-NEXT: sbcs.w r0, r5, lr -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #8, #8 ; 
CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -176,15 +168,11 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: subs.w r0, r4, r12 ; CHECK-NEXT: sbcs.w r0, r5, lr -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -240,15 +228,11 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: subs.w r0, r4, r12 ; CHECK-NEXT: sbcs.w r0, r5, lr -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -329,18 +313,12 @@ ; CHECK-NEXT: vmov r12, r1, d9 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vmov r2, r3, d11 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: bfi r4, r0, #0, #8 ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #1 -; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r4, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r4 diff --git a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll --- a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll @@ -153,21 +153,17 @@ ; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: sbcs.w r1, lr, r3, asr #31 ; CHECK-NEXT: asr.w r5, r3, #31 -; CHECK-NEXT: cset r1, lt ; CHECK-NEXT: asr.w r12, r0, #31 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r0, r12, r2, asr #31 ; CHECK-NEXT: bfi r3, r1, #0, #8 -; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: asrs r4, r2, #31 -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmov q1[3], q1[1], lr, r12 -; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -233,17 +229,13 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: vmov r3, r2, d3 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r12, d1 ; CHECK-NEXT: subs r0, r3, r0 ; CHECK-NEXT: sbcs.w r0, r2, r12 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -265,17 +257,13 @@ ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, r4, d7 ; 
CHECK-NEXT: subs.w r2, r2, lr ; CHECK-NEXT: sbcs.w r2, r4, r12 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmov r2, r12, d0 ; CHECK-NEXT: vmsr p0, r3 @@ -285,16 +273,12 @@ ; CHECK-NEXT: subs r2, r4, r2 ; CHECK-NEXT: sbcs.w r2, r3, r12 ; CHECK-NEXT: vmov r4, r3, d5 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r1, r2, #0, #8 ; CHECK-NEXT: vmov r2, r12, d1 ; CHECK-NEXT: subs r2, r4, r2 ; CHECK-NEXT: sbcs.w r2, r3, r12 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r1, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q2 @@ -453,17 +437,13 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: vmov r3, r2, d3 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r12, d1 ; CHECK-NEXT: subs r0, r3, r0 ; CHECK-NEXT: sbcs.w r0, r2, r12 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -530,17 +510,13 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: vmov r3, r2, d3 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r12, d1 ; CHECK-NEXT: subs r0, r3, r0 ; CHECK-NEXT: sbcs.w r0, r2, r12 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -562,17 +538,13 @@ ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, r4, d7 ; CHECK-NEXT: subs.w r2, r2, lr ; CHECK-NEXT: sbcs.w r2, r4, r12 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmov r2, r12, d0 ; CHECK-NEXT: vmsr p0, r3 @@ -582,16 +554,12 @@ ; CHECK-NEXT: subs r2, r4, r2 ; CHECK-NEXT: sbcs.w r2, r3, r12 ; CHECK-NEXT: vmov r4, r3, d5 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: bfi r1, r2, #0, #8 ; CHECK-NEXT: vmov r2, r12, d1 ; CHECK-NEXT: subs r2, r4, r2 ; CHECK-NEXT: sbcs.w r2, r3, r12 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: bfi r1, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q2 @@ -747,29 +715,25 @@ ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov lr, s2 ; CHECK-NEXT: asrs r2, r0, #31 ; CHECK-NEXT: asrs r3, r1, #31 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q0[2], q0[0], r3, lr -; CHECK-NEXT: asr.w r12, lr, #31 -; CHECK-NEXT: asrs r2, r3, #31 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov q0[2], q0[0], 
r3, r2 +; CHECK-NEXT: asr.w lr, r3, #31 ; CHECK-NEXT: subs r3, r3, r1 -; CHECK-NEXT: sbcs.w r1, r2, r1, asr #31 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: bfi r2, r1, #0, #8 -; CHECK-NEXT: subs.w r1, lr, r0 +; CHECK-NEXT: sbcs.w r1, lr, r1, asr #31 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: bfi r3, r1, #0, #8 +; CHECK-NEXT: subs r1, r2, r0 ; CHECK-NEXT: sbcs.w r0, r12, r0, asr #31 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: bfi r2, r0, #8, #8 -; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: pop {r7, pc} %c = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %a, <2 x i32> %b) @@ -834,17 +798,13 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: vmov r3, r2, d1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r12, d3 ; CHECK-NEXT: subs r0, r3, r0 ; CHECK-NEXT: sbcs.w r0, r2, r12 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -866,17 +826,13 @@ ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: vmov lr, r12, d7 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, r4, d3 ; CHECK-NEXT: subs.w r2, r2, lr ; CHECK-NEXT: sbcs.w r2, r4, r12 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmov r2, r12, d4 ; CHECK-NEXT: vmsr p0, r3 @@ -886,16 +842,12 @@ ; CHECK-NEXT: subs r2, r4, r2 ; CHECK-NEXT: sbcs.w r2, r3, r12 ; CHECK-NEXT: vmov r4, r3, d1 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r1, r2, #0, #8 ; CHECK-NEXT: vmov r2, r12, d5 ; CHECK-NEXT: subs r2, r4, r2 ; CHECK-NEXT: sbcs.w r2, r3, r12 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: bfi r1, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q2 @@ -1054,17 +1006,13 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: vmov r3, r2, d1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r12, d3 ; CHECK-NEXT: subs r0, r3, r0 ; CHECK-NEXT: sbcs.w r0, r2, r12 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1131,17 +1079,13 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: vmov r3, r2, d1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, 
#8 ; CHECK-NEXT: vmov r0, r12, d3 ; CHECK-NEXT: subs r0, r3, r0 ; CHECK-NEXT: sbcs.w r0, r2, r12 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1163,17 +1107,13 @@ ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: vmov lr, r12, d7 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bfi r3, r2, #0, #8 ; CHECK-NEXT: vmov r2, r4, d3 ; CHECK-NEXT: subs.w r2, r2, lr ; CHECK-NEXT: sbcs.w r2, r4, r12 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmov r2, r12, d4 ; CHECK-NEXT: vmsr p0, r3 @@ -1183,16 +1123,12 @@ ; CHECK-NEXT: subs r2, r4, r2 ; CHECK-NEXT: sbcs.w r2, r3, r12 ; CHECK-NEXT: vmov r4, r3, d1 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: bfi r1, r2, #0, #8 ; CHECK-NEXT: vmov r2, r12, d5 ; CHECK-NEXT: subs r2, r4, r2 ; CHECK-NEXT: sbcs.w r2, r3, r12 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: bfi r1, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q2 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll @@ -79,14 +79,10 @@ ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r12, r1 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: sbcs.w r1, r12, r3 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: bx lr @@ -105,24 +101,22 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI6_0 -; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpeq ; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpeq -; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: clz r1, r6 -; CHECK-NEXT: lsrs r0, r0, #5 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: lsrs r1, r1, #5 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csetm r1, eq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 3 @@ -218,14 +212,10 @@ ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: sbcs.w r0, r12, r1 ; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: sbcs.w r1, r12, r3 ; CHECK-NEXT: vmov s2, r0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 @@ 
-247,27 +237,24 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI13_0 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpeq -; CHECK-NEXT: vmov r2, r1, d8 -; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: vldr s17, .LCPI13_1 -; CHECK-NEXT: lsrs r0, r0, #5 -; CHECK-NEXT: cset r6, ne -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpeq -; CHECK-NEXT: clz r0, r0 -; CHECK-NEXT: vmov s18, r6 -; CHECK-NEXT: vmov.f32 s19, s17 -; CHECK-NEXT: lsrs r0, r0, #5 -; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vmov s16, r0 -; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vldr s1, .LCPI13_1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 3 @@ -472,21 +459,18 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: sbcs.w r0, r12, r1 -; CHECK-NEXT: cset r4, lt -; CHECK-NEXT: rsbs r0, r2, #0 -; CHECK-NEXT: sbcs.w r0, r12, r3 +; CHECK-NEXT: sbcs.w r0, r4, r1 ; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: bl __aeabi_ui2d -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: sbcs.w r2, r4, r3 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl __aeabi_ui2d ; CHECK-NEXT: vmov d8, r0, r1 @@ -506,21 +490,18 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: sbcs.w r0, r12, r1 -; CHECK-NEXT: cset r4, lt -; CHECK-NEXT: rsbs r0, r2, #0 -; CHECK-NEXT: sbcs.w r0, r12, r3 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: sbcs.w r0, r4, r1 +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bl __aeabi_i2d -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: sbcs.w r2, r4, r3 +; CHECK-NEXT: csetm r2, lt ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl __aeabi_i2d ; CHECK-NEXT: vmov d8, r0, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -6,8 +6,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, #12 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB0_8 ; CHECK-NEXT: @ %bb.1: @ %entry @@ -16,64 +16,57 @@ ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov 
r12, r0 -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: mov r10, r2 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: @ %vector.ph -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: bic r3, r3, #1 -; CHECK-NEXT: subs r7, r3, #2 +; CHECK-NEXT: bic r5, r3, #1 ; CHECK-NEXT: adr r4, .LCPI0_0 +; CHECK-NEXT: subs r7, r5, #2 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: add.w r3, r1, r5, lsl #2 ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI0_1 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r11, r2, r3, lsl #2 -; CHECK-NEXT: add.w r10, r1, r3, lsl #2 -; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r5, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r10, r2, r5, lsl #2 +; CHECK-NEXT: add.w r12, r0, r5, lsl #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r5, [r0], #8 -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: ldrd r4, r6, [r0], #8 +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: ldrd r7, r8, [r1], #8 -; CHECK-NEXT: smull r8, r5, r8, r5 -; CHECK-NEXT: smull r4, r7, r7, r4 -; CHECK-NEXT: asrl r8, r5, #31 -; CHECK-NEXT: asrl r4, r7, #31 +; CHECK-NEXT: smull r4, r11, r7, r4 +; CHECK-NEXT: asrl r4, r11, #31 ; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r8 ; CHECK-NEXT: mov.w r9, #-1 -; CHECK-NEXT: sbcs.w r3, r9, r7 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: vmov q2[3], q2[1], r7, r5 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: bfi r6, r3, #0, #8 -; CHECK-NEXT: rsbs.w r3, r8, #-2147483648 -; CHECK-NEXT: sbcs.w r3, r9, r5 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: bfi r6, r3, #8, #8 -; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: sbcs.w r3, r9, r11 +; CHECK-NEXT: csetm r3, lt +; CHECK-NEXT: bfi r5, r3, #0, #8 +; CHECK-NEXT: smull r6, r3, r8, r6 +; CHECK-NEXT: asrl r6, r3, #31 +; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r6 +; CHECK-NEXT: sbcs.w r7, r9, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r11, r3 +; CHECK-NEXT: csetm r7, lt ; CHECK-NEXT: mvn r6, #-2147483648 +; CHECK-NEXT: bfi r5, r7, #8, #8 +; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: vpsel q2, q2, q0 ; CHECK-NEXT: vmov r3, r4, d4 ; CHECK-NEXT: subs r3, r3, r6 ; CHECK-NEXT: sbcs r3, r4, #0 ; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #0, #8 ; CHECK-NEXT: vmov r3, r5, d5 ; CHECK-NEXT: subs r3, r3, r6 ; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vpsel q2, q2, q1 @@ -83,6 +76,7 @@ ; CHECK-NEXT: le lr, .LBB0_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB0_8 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader @@ -93,7 +87,7 @@ ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r3, [r12], #4 -; CHECK-NEXT: ldr r4, [r10], #4 +; CHECK-NEXT: ldr r4, [r6], #4 ; CHECK-NEXT: smull r4, r3, r4, r3 ; CHECK-NEXT: asrl r4, r3, #31 ; CHECK-NEXT: subs r5, r1, r4 @@ 
-105,10 +99,10 @@ ; CHECK-NEXT: subs r5, r4, r2 ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: csel r3, r4, r2, lt -; CHECK-NEXT: str r3, [r11], #4 +; CHECK-NEXT: str r3, [r10], #4 ; CHECK-NEXT: le lr, .LBB0_7 ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: add sp, #12 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.9: @@ -212,122 +206,108 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB1_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov r9, r5 ; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_3: @ %vector.ph -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r7, r3, #4 +; CHECK-NEXT: bic r1, r3, #3 ; CHECK-NEXT: adr r4, .LCPI1_0 +; CHECK-NEXT: subs r7, r1, #4 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: add.w r7, r1, r3, lsl #2 -; CHECK-NEXT: strd r7, r3, [sp, #4] @ 8-byte Folded Spill ; CHECK-NEXT: adr r4, .LCPI1_1 -; CHECK-NEXT: add.w r11, r2, r3, lsl #2 -; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r11, r2, r1, lsl #2 +; CHECK-NEXT: add.w r9, r5, r1, lsl #2 +; CHECK-NEXT: add.w r12, r0, r1, lsl #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: mov.w r9, #-1 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q4, [r5], #16 ; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: vldrw.u32 q4, [r1], #16 -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: mov.w r2, #-1 ; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: mov.w r6, #-1 +; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: vmov.f32 s20, s18 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: vmov.f32 s10, s15 ; CHECK-NEXT: vmov.f32 s22, s19 ; CHECK-NEXT: vmullb.s32 q6, q5, q2 -; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: vmov r4, r7, d12 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 -; CHECK-NEXT: sbcs.w r5, r3, r7 -; CHECK-NEXT: cset r5, lt -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: sbcs.w r5, r2, r7 +; CHECK-NEXT: csetm r5, lt ; CHECK-NEXT: bfi r8, r5, #0, #8 ; CHECK-NEXT: vmov r10, r5, d13 ; CHECK-NEXT: asrl r10, r5, #31 +; CHECK-NEXT: vmov r6, s18 ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r10 -; CHECK-NEXT: sbcs.w r3, r6, r5 +; CHECK-NEXT: sbcs.w r3, r2, r5 ; CHECK-NEXT: vmov q2[3], q2[1], r7, r5 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: mvn r10, #-2147483648 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mov.w r6, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r8, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r8 +; CHECK-NEXT: mvn r8, #-2147483648 ; CHECK-NEXT: vpsel q2, q2, q0 ; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: subs.w r3, r3, r10 +; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r4, #0 ; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; 
CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #0, #8 ; CHECK-NEXT: vmov r3, r5, d5 -; CHECK-NEXT: subs.w r3, r3, r10 +; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #8, #8 ; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: vpsel q2, q2, q1 -; CHECK-NEXT: smull r8, r7, r4, r3 -; CHECK-NEXT: asrl r8, r7, #31 -; CHECK-NEXT: rsbs.w r3, r8, #-2147483648 -; CHECK-NEXT: sbcs.w r3, r9, r7 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: bfi r6, r3, #0, #8 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: smull r4, r5, r5, r3 -; CHECK-NEXT: asrl r4, r5, #31 +; CHECK-NEXT: smull r4, r7, r4, r3 +; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 -; CHECK-NEXT: vmov q3[2], q3[0], r8, r4 -; CHECK-NEXT: sbcs.w r3, r9, r5 -; CHECK-NEXT: vmov q3[3], q3[1], r7, r5 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: bfi r6, r3, #8, #8 -; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: sbcs.w r3, r2, r7 +; CHECK-NEXT: csetm r3, lt +; CHECK-NEXT: bfi r5, r3, #0, #8 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: smull r6, r3, r6, r3 +; CHECK-NEXT: asrl r6, r3, #31 +; CHECK-NEXT: rsbs.w r1, r6, #-2147483648 +; CHECK-NEXT: vmov q3[2], q3[0], r4, r6 +; CHECK-NEXT: sbcs.w r1, r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r7, r3 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r5, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload ; CHECK-NEXT: vpsel q3, q3, q0 -; CHECK-NEXT: vmov r3, r4, d6 -; CHECK-NEXT: subs.w r3, r3, r10 -; CHECK-NEXT: sbcs r3, r4, #0 -; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: bfi r4, r3, #0, #8 -; CHECK-NEXT: vmov r3, r5, d7 -; CHECK-NEXT: subs.w r3, r3, r10 -; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: bfi r4, r3, #8, #8 -; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vmov r1, r3, d6 +; CHECK-NEXT: subs.w r1, r1, r8 +; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r3, r1, #0, #8 +; CHECK-NEXT: vmov r1, r4, d7 +; CHECK-NEXT: subs.w r1, r1, r8 +; CHECK-NEXT: sbcs r1, r4, #0 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q3, q3, q1 ; CHECK-NEXT: vmov.f32 s13, s14 ; CHECK-NEXT: vmov.f32 s14, s8 @@ -335,31 +315,30 @@ ; CHECK-NEXT: vstrb.8 q3, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldrd r7, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r7, r3 +; CHECK-NEXT: ldrd r1, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r7 -; CHECK-NEXT: mov.w r1, #-1 +; CHECK-NEXT: sub.w lr, r3, r1 +; CHECK-NEXT: mov.w r0, #-1 ; CHECK-NEXT: mov.w r3, #-2147483648 ; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: .LBB1_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r4, [r12], #4 -; CHECK-NEXT: ldr r5, [r0], #4 -; CHECK-NEXT: smull r4, r5, r5, r4 -; CHECK-NEXT: asrl r4, r5, #31 -; CHECK-NEXT: subs r6, r3, r4 -; 
CHECK-NEXT: sbcs.w r6, r1, r5 -; CHECK-NEXT: cset r6, lt -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: ldr r1, [r12], #4 +; CHECK-NEXT: ldr r4, [r9], #4 +; CHECK-NEXT: smull r4, r1, r4, r1 +; CHECK-NEXT: asrl r4, r1, #31 +; CHECK-NEXT: subs r5, r3, r4 +; CHECK-NEXT: sbcs.w r5, r0, r1 +; CHECK-NEXT: cset r5, lt +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: csel r4, r4, r3, ne -; CHECK-NEXT: csel r5, r5, r1, ne -; CHECK-NEXT: subs r6, r4, r2 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: csel r4, r4, r2, lt -; CHECK-NEXT: str r4, [r11], #4 +; CHECK-NEXT: csel r1, r1, r0, ne +; CHECK-NEXT: subs r5, r4, r2 +; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: csel r1, r4, r2, lt +; CHECK-NEXT: str r1, [r11], #4 ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #16 @@ -468,21 +447,21 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adds r7, r3, #3 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: bic r7, r7, #3 +; CHECK-NEXT: adds r6, r3, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: bic r6, r6, #3 ; CHECK-NEXT: adr r4, .LCPI2_1 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: adr r5, .LCPI2_2 +; CHECK-NEXT: subs r6, #4 ; CHECK-NEXT: vldrw.u32 q2, [r4] -; CHECK-NEXT: vldrw.u32 q3, [r5] -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: adr r6, .LCPI2_0 -; CHECK-NEXT: subs r7, r3, #1 -; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: mov.w r9, #0 -; CHECK-NEXT: vdup.32 q1, r7 ; CHECK-NEXT: mov.w r12, #-1 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: adr r5, .LCPI2_0 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: adr r5, .LCPI2_2 +; CHECK-NEXT: subs r6, r3, #1 +; CHECK-NEXT: vldrw.u32 q3, [r5] +; CHECK-NEXT: vdup.32 q1, r6 ; CHECK-NEXT: mvn r8, #-2147483648 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB2_2: @ %vector.body @@ -502,14 +481,12 @@ ; CHECK-NEXT: vmov.f32 s28, s26 ; CHECK-NEXT: vmov.f32 s30, s27 ; CHECK-NEXT: vmullb.s32 q0, q7, q4 -; CHECK-NEXT: vmov.f32 s22, s21 +; CHECK-NEXT: vmov.f32 s22, s25 ; CHECK-NEXT: vmov r10, r5, d0 ; CHECK-NEXT: asrl r10, r5, #31 ; CHECK-NEXT: rsbs.w r7, r10, #-2147483648 ; CHECK-NEXT: sbcs.w r7, r12, r5 -; CHECK-NEXT: cset r7, lt -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne +; CHECK-NEXT: csetm r7, lt ; CHECK-NEXT: bfi r4, r7, #0, #8 ; CHECK-NEXT: vmov r6, r7, d1 ; CHECK-NEXT: asrl r6, r7, #31 @@ -517,72 +494,58 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r10, r6 ; CHECK-NEXT: sbcs.w r3, r12, r7 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r7 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt +; CHECK-NEXT: vmov r7, s22 ; CHECK-NEXT: bfi r4, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vpsel q4, q0, q2 -; CHECK-NEXT: vmov.f32 s2, s25 -; CHECK-NEXT: vmov r3, r4, d8 -; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r4, #0 ; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #0, #8 -; CHECK-NEXT: vmov r3, r5, d9 +; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #8, #8 ; CHECK-NEXT: vmov r3, s20 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vmov r4, s24 -; CHECK-NEXT: vpsel q4, q4, q3 +; CHECK-NEXT: vpsel q4, q0, q3 +; CHECK-NEXT: vmov.f32 
s2, s21 ; CHECK-NEXT: smull r10, r5, r4, r3 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: asrl r10, r5, #31 ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 ; CHECK-NEXT: sbcs.w r3, r12, r5 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #0, #8 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: smull r6, r7, r7, r3 -; CHECK-NEXT: asrl r6, r7, #31 -; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: smull r6, r3, r7, r3 +; CHECK-NEXT: asrl r6, r3, #31 +; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 ; CHECK-NEXT: vmov q0[2], q0[0], r10, r6 -; CHECK-NEXT: sbcs.w r3, r12, r7 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r7 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: bfi r4, r3, #8, #8 +; CHECK-NEXT: sbcs.w r7, r12, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: csetm r7, lt +; CHECK-NEXT: bfi r4, r7, #8, #8 ; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vpsel q5, q0, q2 -; CHECK-NEXT: vmov r3, r4, d10 +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r4, #0 ; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #0, #8 -; CHECK-NEXT: vmov r3, r5, d11 +; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: cset r3, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vpsel q0, q5, q3 +; CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vldr p0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s16 @@ -693,9 +656,7 @@ ; CHECK-NEXT: lsrl r4, r9, #31 ; CHECK-NEXT: subs.w r5, r4, #-1 ; CHECK-NEXT: sbcs r5, r9, #0 -; CHECK-NEXT: cset r5, lo -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: csetm r5, lo ; CHECK-NEXT: bfi r8, r5, #0, #8 ; CHECK-NEXT: umull r6, r5, r3, r6 ; CHECK-NEXT: lsrl r6, r5, #31 @@ -703,9 +664,7 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r4, r6 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: vmov q1[3], q1[1], r9, r5 -; CHECK-NEXT: cset r3, lo -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lo ; CHECK-NEXT: bfi r8, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r8 ; CHECK-NEXT: vpsel q1, q1, q0 @@ -858,9 +817,7 @@ ; CHECK-NEXT: subs.w r5, r4, #-1 ; CHECK-NEXT: vmullb.u32 q4, q3, q1 ; CHECK-NEXT: sbcs r5, r9, #0 -; CHECK-NEXT: cset r5, lo -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: csetm r5, lo ; CHECK-NEXT: bfi r6, r5, #0, #8 ; CHECK-NEXT: vmov r8, r5, d11 ; CHECK-NEXT: lsrl r8, r5, #31 @@ -868,9 +825,7 @@ ; CHECK-NEXT: vmov q2[2], q2[0], r4, r8 ; CHECK-NEXT: sbcs r7, r5, #0 ; CHECK-NEXT: vmov q2[3], q2[1], r9, r5 -; CHECK-NEXT: cset r7, lo -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne +; CHECK-NEXT: csetm r7, lo ; CHECK-NEXT: bfi r6, r7, #8, #8 ; CHECK-NEXT: vmov r4, r7, d8 ; CHECK-NEXT: lsrl r4, r7, #31 @@ -879,9 +834,7 @@ ; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: sbcs r5, r7, #0 ; CHECK-NEXT: vpsel q2, q2, q0 -; CHECK-NEXT: cset r5, lo -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: csetm r5, lo ; CHECK-NEXT: bfi r6, r5, #0, #8 ; CHECK-NEXT: vmov r2, r5, d9 ; CHECK-NEXT: lsrl r2, r5, #31 @@ -889,9 +842,7 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r4, r2 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: vmov q1[3], 
q1[1], r7, r5 -; CHECK-NEXT: cset r3, lo -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: csetm r3, lo ; CHECK-NEXT: bfi r6, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll --- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -36,21 +36,19 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, r2, d2 -; CHECK-NEXT: vmov r3, r1, d0 -; CHECK-NEXT: adds.w r12, r3, r0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, r2, r0 ; CHECK-NEXT: vmov r0, r4, d1 -; CHECK-NEXT: adc.w lr, r1, r2 -; CHECK-NEXT: subs.w r3, r12, r3 -; CHECK-NEXT: sbcs.w r1, lr, r1 -; CHECK-NEXT: cset r1, lt +; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: subs.w r2, r12, r2 +; CHECK-NEXT: sbcs.w r2, lr, r3 +; CHECK-NEXT: cset r2, lt ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it mi -; CHECK-NEXT: eormi r1, r1, #1 +; CHECK-NEXT: eormi r2, r2, #1 +; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: bfi r2, r1, #0, #8 ; CHECK-NEXT: vmov r1, r3, d3 ; CHECK-NEXT: adds r1, r1, r0 @@ -59,14 +57,12 @@ ; CHECK-NEXT: sbcs.w r0, r5, r4 ; CHECK-NEXT: vmov q0[2], q0[0], r12, r1 ; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: asr.w r1, lr, #31 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov q0[3], q0[1], lr, r5 -; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it mi ; CHECK-NEXT: eormi r0, r0, #1 +; CHECK-NEXT: asr.w r1, lr, #31 ; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov q0[3], q0[1], lr, r5 ; CHECK-NEXT: bfi r2, r0, #8, #8 ; CHECK-NEXT: asrs r0, r5, #31 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 @@ -122,32 +118,28 @@ define arm_aapcs_vfpcc <2 x i64> @uadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: uadd_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w lr, r2, r0 -; CHECK-NEXT: vmov r0, r4, d0 -; CHECK-NEXT: adc.w r12, r3, r1 -; CHECK-NEXT: subs.w r2, lr, r2 -; CHECK-NEXT: sbcs.w r2, r12, r3 -; CHECK-NEXT: vmov r3, r1, d2 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: adds r3, r3, r0 -; CHECK-NEXT: adcs r1, r4 -; CHECK-NEXT: subs r0, r3, r0 -; CHECK-NEXT: sbcs.w r0, r1, r4 -; CHECK-NEXT: vmov q1[2], q1[0], r3, lr -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: vmov q1[3], q1[1], r1, r12 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: adds r5, r2, r0 +; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: subs r2, r5, r2 +; CHECK-NEXT: sbcs.w r2, lr, r3 +; CHECK-NEXT: vmov r3, r12, d2 +; CHECK-NEXT: vmov r1, r4, d0 +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: adds r3, r3, r1 +; CHECK-NEXT: adc.w r0, r4, r12 +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: sbcs.w r1, r0, r4 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r5 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: vmov q1[3], q1[1], r0, lr +; CHECK-NEXT: vmov q0[2], q0[0], r1, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r2 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} 
entry: %0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 @@ -187,55 +179,47 @@ define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: ssub_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r1, r3, d2 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: rsbs r2, r1, #0 -; CHECK-NEXT: sbcs.w r2, r0, r3 -; CHECK-NEXT: vmov r2, r4, d0 -; CHECK-NEXT: cset lr, lt -; CHECK-NEXT: subs.w r12, r2, r1 -; CHECK-NEXT: sbc.w r5, r4, r3 -; CHECK-NEXT: subs.w r2, r12, r2 -; CHECK-NEXT: sbcs.w r2, r5, r4 -; CHECK-NEXT: vmov r3, r4, d3 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r2, ne -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: eorne r2, r2, #1 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r1, r0, d0 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: subs.w r12, r1, r2 +; CHECK-NEXT: sbc.w lr, r0, r3 +; CHECK-NEXT: subs.w r1, r12, r1 +; CHECK-NEXT: sbcs.w r0, lr, r0 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: cset r0, lt ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: rsbs r1, r3, #0 -; CHECK-NEXT: sbcs.w r1, r0, r4 -; CHECK-NEXT: bfi r0, r2, #0, #8 -; CHECK-NEXT: vmov r2, r1, d1 -; CHECK-NEXT: cset lr, lt -; CHECK-NEXT: subs r3, r2, r3 -; CHECK-NEXT: sbc.w r4, r1, r4 -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: sbcs.w r1, r4, r1 -; CHECK-NEXT: vmov q0[2], q0[0], r12, r3 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: eorne r1, r1, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r0, r1, #8, #8 -; CHECK-NEXT: asrs r1, r5, #31 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: asrs r0, r4, #31 +; CHECK-NEXT: sbcs.w r2, r1, r3 +; CHECK-NEXT: it lt +; CHECK-NEXT: eorlt r0, r0, #1 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: subs r6, r4, r2 +; CHECK-NEXT: sbc.w r7, r5, r3 +; CHECK-NEXT: subs r4, r6, r4 +; CHECK-NEXT: sbcs.w r4, r7, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r6 +; CHECK-NEXT: cset r4, lt +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: sbcs.w r2, r1, r3 +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: it lt +; CHECK-NEXT: eorlt r4, r4, #1 +; CHECK-NEXT: rsbs r0, r4, #0 +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: asrs r0, r7, #31 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: asr.w r1, lr, #31 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], lr, r7 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: adr r0, .LCPI11_0 ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: veor q1, q1, q2 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI11_0: @@ -281,32 +265,28 @@ define arm_aapcs_vfpcc <2 x i64> @usub_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: usub_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: subs.w lr, r2, r0 -; CHECK-NEXT: vmov r0, r4, d0 -; CHECK-NEXT: sbc.w r12, r3, r1 -; CHECK-NEXT: subs.w r2, r2, lr -; CHECK-NEXT: sbcs.w r2, r3, r12 -; CHECK-NEXT: vmov r3, r1, d2 -; CHECK-NEXT: cset r2, lo -; 
CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: subs r3, r0, r3 -; CHECK-NEXT: sbc.w r1, r4, r1 -; CHECK-NEXT: subs r0, r0, r3 -; CHECK-NEXT: sbcs.w r0, r4, r1 -; CHECK-NEXT: vmov q1[2], q1[0], r3, lr -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: vmov q1[3], q1[1], r1, r12 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: subs r5, r2, r0 +; CHECK-NEXT: sbc.w lr, r3, r1 +; CHECK-NEXT: subs r2, r2, r5 +; CHECK-NEXT: sbcs.w r2, r3, lr +; CHECK-NEXT: vmov r3, r12, d2 +; CHECK-NEXT: vmov r1, r4, d0 +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: subs r3, r1, r3 +; CHECK-NEXT: sbc.w r0, r4, r12 +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: sbcs.w r1, r4, r0 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r5 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: vmov q1[3], q1[1], r0, lr +; CHECK-NEXT: vmov q0[2], q0[0], r1, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r2 ; CHECK-NEXT: vbic q0, q1, q0 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll @@ -399,17 +399,13 @@ ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: vmov r3, r2, d1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r12, d3 ; CHECK-NEXT: subs r0, r3, r0 ; CHECK-NEXT: sbcs.w r0, r2, r12 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q2, q3 @@ -470,8 +466,6 @@ ; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r1, ne ; CHECK-NEXT: cmp r3, #0 @@ -487,8 +481,6 @@ ; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 ; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r1, ne ; CHECK-NEXT: cmp r3, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll @@ -279,25 +279,17 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_une_v4f32(<4 x float> %src, <4 x float> %src2, <4 x float> %a, <4 x float> %b) { ; CHECK-MVE-LABEL: vcmp_une_v4f32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s1, s5 -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vcmp.f32 s0, s4 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s3, s7 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s2, s6 -; CHECK-MVE-NEXT: cset r2, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: cmp r2, #0 ; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11 -; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s1, s5 ; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10 -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s0, s4 ; CHECK-MVE-NEXT: 
vseleq.f32 s1, s13, s9 -; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8 ; CHECK-MVE-NEXT: bx lr ; @@ -1122,69 +1114,53 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vmovx.f16 s20, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmovx.f16 s22, s12 ; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 ; CHECK-MVE-NEXT: vmovx.f16 s4, s5 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s16, s22, s20 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 ; CHECK-MVE-NEXT: vmovx.f16 s8, s1 ; CHECK-MVE-NEXT: vcmp.f16 s8, s4 -; CHECK-MVE-NEXT: vmovx.f16 s4, s9 -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s8, s13 -; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: vins.f16 s0, s16 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f16 s1, s5 +; CHECK-MVE-NEXT: vmovx.f16 s12, s9 +; CHECK-MVE-NEXT: vmovx.f16 s16, s13 ; CHECK-MVE-NEXT: vmovx.f16 s8, s2 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vseleq.f16 s4, s16, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmovx.f16 s12, s10 ; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 ; CHECK-MVE-NEXT: vins.f16 s1, s4 ; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vcmp.f16 s8, s4 -; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 -; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vseleq.f16 s4, s5, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vmovx.f16 s10, s15 ; CHECK-MVE-NEXT: vins.f16 s2, s4 ; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 -; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vseleq.f16 s4, s10, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 ; CHECK-MVE-NEXT: vins.f16 s3, s4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_une_v8f16: diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll @@ -300,25 +300,17 @@ 
define arm_aapcs_vfpcc <4 x float> @vcmp_une_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) { ; CHECK-MVE-LABEL: vcmp_une_v4f32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s1, s4 -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vcmp.f32 s0, s4 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s3, s4 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s2, s4 -; CHECK-MVE-NEXT: cset r2, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: cmp r2, #0 ; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11 -; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s1, s4 ; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10 -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s0, s4 ; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9 -; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8 ; CHECK-MVE-NEXT: bx lr ; @@ -1137,59 +1129,43 @@ ; CHECK-MVE-LABEL: vcmp_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 -; CHECK-MVE-NEXT: vmovx.f16 s5, s12 +; CHECK-MVE-NEXT: vmovx.f16 s5, s8 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 -; CHECK-MVE-NEXT: vmovx.f16 s6, s8 +; CHECK-MVE-NEXT: vmovx.f16 s7, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s7, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 -; CHECK-MVE-NEXT: vmovx.f16 s6, s9 +; CHECK-MVE-NEXT: vmovx.f16 s12, s13 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s1, s4 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s12, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s8, s14 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 +; CHECK-MVE-NEXT: vmovx.f16 s12, s14 ; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 ; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 -; CHECK-MVE-NEXT: vmovx.f16 s6, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s2, s4 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s12, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s8, s15 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s11 ; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vmovx.f16 s10, s15 ; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 -; CHECK-MVE-NEXT: vmovx.f16 s6, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s10, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; 
CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 ; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr @@ -1961,25 +1937,17 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_r_une_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) { ; CHECK-MVE-LABEL: vcmp_r_une_v4f32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s4, s1 -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vcmp.f32 s4, s0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s4, s3 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s4, s2 -; CHECK-MVE-NEXT: cset r2, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: cmp r2, #0 ; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11 -; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s4, s1 ; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10 -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s4, s0 ; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9 -; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8 ; CHECK-MVE-NEXT: bx lr ; @@ -2798,59 +2766,43 @@ ; CHECK-MVE-LABEL: vcmp_r_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 -; CHECK-MVE-NEXT: vmovx.f16 s5, s12 +; CHECK-MVE-NEXT: vmovx.f16 s5, s8 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 -; CHECK-MVE-NEXT: vmovx.f16 s6, s8 +; CHECK-MVE-NEXT: vmovx.f16 s7, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s4, s0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s7, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 -; CHECK-MVE-NEXT: vmovx.f16 s6, s9 +; CHECK-MVE-NEXT: vmovx.f16 s12, s13 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s4, s1 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s12, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s8, s14 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 +; CHECK-MVE-NEXT: vmovx.f16 s12, s14 ; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 ; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 -; CHECK-MVE-NEXT: vmovx.f16 s6, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s4, s2 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s12, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s8, s15 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s11 ; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vmovx.f16 s10, s15 ; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 -; CHECK-MVE-NEXT: vmovx.f16 s6, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; 
CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s10, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 ; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll @@ -279,25 +279,17 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_une_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { ; CHECK-MVE-LABEL: vcmp_une_v4f32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s1, #0 -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vcmp.f32 s0, #0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s3, #0 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s2, #0 -; CHECK-MVE-NEXT: cset r2, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: cmp r2, #0 ; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 -; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 ; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 ; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 -; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 ; CHECK-MVE-NEXT: bx lr ; @@ -1074,59 +1066,43 @@ ; CHECK-MVE-LABEL: vcmp_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 -; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 -; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmovx.f16 s13, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s13, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 ; CHECK-MVE-NEXT: vmovx.f16 s4, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vmovx.f16 s4, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s8, s9 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmovx.f16 s12, s9 ; CHECK-MVE-NEXT: vcmp.f16 s1, #0 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s8, s10 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vmovx.f16 s12, s10 ; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 ; CHECK-MVE-NEXT: vins.f16 s1, s4 ; CHECK-MVE-NEXT: vmovx.f16 s4, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s11 ; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 -; 
CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: vins.f16 s2, s4 ; CHECK-MVE-NEXT: vmovx.f16 s4, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 ; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr @@ -1856,25 +1832,17 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_r_une_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { ; CHECK-MVE-LABEL: vcmp_r_une_v4f32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s1, #0 -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vcmp.f32 s0, #0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s3, #0 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f32 s2, #0 -; CHECK-MVE-NEXT: cset r2, ne -; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: cmp r2, #0 ; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 -; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 ; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 ; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 -; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 ; CHECK-MVE-NEXT: bx lr ; @@ -2651,59 +2619,43 @@ ; CHECK-MVE-LABEL: vcmp_r_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 -; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 -; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmovx.f16 s13, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s13, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 ; CHECK-MVE-NEXT: vmovx.f16 s4, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vmovx.f16 s4, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s8, s9 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmovx.f16 s12, s9 ; CHECK-MVE-NEXT: vcmp.f16 s1, #0 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: vmovx.f16 s8, s10 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vmovx.f16 s12, s10 ; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 ; CHECK-MVE-NEXT: vins.f16 s1, s4 ; CHECK-MVE-NEXT: vmovx.f16 s4, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s8 ; CHECK-MVE-NEXT: 
vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s11 ; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 -; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: vins.f16 s2, s4 ; CHECK-MVE-NEXT: vmovx.f16 s4, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 ; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll @@ -507,8 +507,6 @@ ; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r1, ne ; CHECK-NEXT: cmp r3, #0 @@ -524,8 +522,6 @@ ; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 ; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r1, ne ; CHECK-NEXT: cmp r3, #0 @@ -1056,8 +1052,6 @@ ; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r1, ne ; CHECK-NEXT: cmp r3, #0 @@ -1073,8 +1067,6 @@ ; CHECK-NEXT: sbcs.w r2, r12, r3, asr #31 ; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r1, ne ; CHECK-NEXT: cmp r3, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll @@ -169,17 +169,13 @@ ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: subs.w r0, r0, r12 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: bfi r3, r1, #0, #8 ; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: subs.w r1, r1, r12 ; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r3, r1, #8, #8 ; CHECK-NEXT: adr r1, .LCPI12_0 ; CHECK-NEXT: vldrw.u32 q1, [r1] @@ -189,16 +185,12 @@ ; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 ; CHECK-NEXT: sbcs.w r1, r3, r2 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #0, #8 ; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 ; CHECK-NEXT: sbcs.w r1, r3, r2 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: adr r0, .LCPI12_1 @@ -233,17 +225,13 @@ ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 ; CHECK-NEXT: sbcs.w r0, r12, r1 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: 
csetm r1, lt +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: bfi r3, r1, #0, #8 ; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 ; CHECK-NEXT: sbcs.w r1, r12, r2 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r3, r1, #8, #8 ; CHECK-NEXT: adr r1, .LCPI13_0 ; CHECK-NEXT: vldrw.u32 q1, [r1] @@ -253,16 +241,12 @@ ; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #0, #8 ; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: sbcs r1, r2, #0 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r0, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: adr r0, .LCPI13_1 @@ -297,16 +281,12 @@ ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r2, d1 ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r2, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -325,16 +305,12 @@ ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r2, d1 ; CHECK-NEXT: subs.w r0, r0, #-1 ; CHECK-NEXT: sbcs r0, r2, #0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll @@ -182,49 +182,41 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, r1, d1 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: mvn r12, #-2147483648 -; CHECK-NEXT: vmov r0, r3, d0 -; CHECK-NEXT: asrl r2, r1, #3 -; CHECK-NEXT: asrl r0, r3, #3 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: subs.w r0, r0, r12 -; CHECK-NEXT: sbcs r0, r3, #0 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: asrl r0, r1, #3 +; CHECK-NEXT: asrl r2, r3, #3 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: subs.w r2, r2, r12 +; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: csetm lr, lt +; CHECK-NEXT: subs.w r0, r0, r12 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: bfi r2, lr, #0, #8 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: adr r0, .LCPI12_0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: mov.w r2, #-1 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: csetm lr, ne -; CHECK-NEXT: subs.w r2, r2, r12 -; CHECK-NEXT: sbcs r1, r1, #0 -; CHECK-NEXT: bfi r3, lr, #0, #8 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: bfi r3, r1, #8, #8 -; CHECK-NEXT: adr 
r1, .LCPI12_0 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmsr p0, r3 -; CHECK-NEXT: mov.w r3, #-1 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r3, r2 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: bfi r0, r1, #0, #8 -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r3, r2 -; CHECK-NEXT: cset r1, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: bfi r0, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r2, r1 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r3, r0, #0, #8 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r2, r1 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: adr r0, .LCPI12_1 ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 @@ -251,53 +243,45 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_sminmax(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_sminmax: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmov r2, r1, d0 ; CHECK-NEXT: mov.w r12, #-1 ; CHECK-NEXT: asrl r2, r1, #3 -; CHECK-NEXT: mov.w lr, #0 +; CHECK-NEXT: vmov r4, r5, d1 ; CHECK-NEXT: rsbs.w r0, r2, #-2147483648 +; CHECK-NEXT: asrl r4, r5, #3 ; CHECK-NEXT: sbcs.w r0, r12, r1 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: bfi r0, r3, #0, #8 -; CHECK-NEXT: vmov r4, r3, d1 -; CHECK-NEXT: asrl r4, r3, #3 -; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: csetm lr, lt +; CHECK-NEXT: rsbs.w r0, r4, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r12, r5 +; CHECK-NEXT: bfi r3, lr, #0, #8 +; CHECK-NEXT: csetm r0, lt ; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 -; CHECK-NEXT: sbcs.w r5, r12, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 -; CHECK-NEXT: cset r5, lt -; CHECK-NEXT: mvn r2, #-2147483648 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: bfi r0, r5, #8, #8 -; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: adr r0, .LCPI13_0 ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 +; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: subs r0, r0, r2 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: bfi lr, r0, #0, #8 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r6, r0, #0, #8 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: subs r0, r0, r2 ; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: bfi lr, r0, #8, #8 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r6, r0, #8, #8 ; CHECK-NEXT: adr r0, .LCPI13_1 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmsr p0, lr +; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI13_0: @@ -331,16 +315,12 @@ ; CHECK-NEXT: subs.w r2, r2, 
#-1 ; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -364,16 +344,12 @@ ; CHECK-NEXT: subs.w r2, r2, #-1 ; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: cset r2, lo -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: csetm r2, lo ; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: bfi r3, r2, #0, #8 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: bfi r3, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 diff --git a/llvm/test/DebugInfo/X86/skeleton-unit-verify.s b/llvm/test/DebugInfo/X86/skeleton-unit-verify.s --- a/llvm/test/DebugInfo/X86/skeleton-unit-verify.s +++ b/llvm/test/DebugInfo/X86/skeleton-unit-verify.s @@ -5,6 +5,8 @@ # CHECK-NEXT: Verifying .debug_info Unit Header Chain... # CHECK-NEXT: Verifying .debug_types Unit Header Chain... # CHECK-NEXT: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 2, "test.cpp" +# CHECK-NEXT: Verifying unit: 2 / 2{{$}} # CHECK-NEXT: warning: DW_TAG_skeleton_unit has DW_CHILDREN_yes but DIE has no children # CHECK-NEXT: DW_TAG_skeleton_unit # CHECK-NEXT: error: Skeleton compilation unit has children. @@ -15,6 +17,8 @@ .byte 1 # Abbreviation Code .byte 74 # DW_TAG_skeleton_unit .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string .byte 0 # EOM(1) .byte 0 # EOM(2) .byte 2 # Abbreviation Code @@ -34,6 +38,7 @@ .long .debug_abbrev # Offset Into Abbrev. 
Section .quad -6573227469967412476 .byte 1 # Abbrev [1] + .asciz "test.cpp" .byte 0 .Lcu_end0: .long .Lcu_end1-.Lcu_start1 # Length of Unit diff --git a/llvm/test/MC/COFF/cv-compiler-info-clamp.ll b/llvm/test/MC/COFF/cv-compiler-info-clamp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/MC/COFF/cv-compiler-info-clamp.ll @@ -0,0 +1,61 @@ +; Check that a large version number (4.0.20211223) is clamped to the proper size +; RUN: llc -mtriple i686-pc-windows-msvc < %s | FileCheck %s --check-prefixes=CHECK,STDOUT +; RUN: llc -mtriple i686-pc-windows-msvc < %s -o %t +; RUN: FileCheck %s --input-file=%t --check-prefixes=CHECK,FILE + +; RUN: llvm-mc %t -triple=i686-pc-win32 -filetype=obj -o %t.obj +; RUN: llvm-pdbutil dump -il -symbols %t.obj | FileCheck %s --check-prefixes=CODEVIEW + +; ModuleID = 'D:\src\scopes\foo.cpp' +source_filename = "D:\5Csrc\5Cscopes\5Cfoo.cpp" +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc19.0.23918" + +; Function Attrs: nounwind sspstrong +define i32 @"\01?foo@@YAHXZ"() #0 !dbg !10 { +entry: + ret i32 42, !dbg !14 +} + +attributes #0 = { nounwind sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.20211223 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +; One .debug$S section should contain an S_COMPILE3 record that identifies the +; source language and the version of the compiler based on the DICompileUnit. 
+; CHECK: .section .debug$S,"dr" +; CHECK: .short 4353 # Record kind: S_OBJNAME +; CHECK-NEXT: .long 0 # Signature +; STDOUT-NEXT: .byte 0 # Object name +; FILE-NEXT: .asciz "{{.*}}{{\\\\|/}}cv-compiler-info-clamp.ll.tmp" # Object name +; CHECK: .short 4412 # Record kind: S_COMPILE3 +; CHECK-NEXT: .long 1 # Flags and language +; CHECK-NEXT: .short 7 # CPUType +; CHECK-NEXT: .short 4 # Frontend version +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 65535 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short [[BACKEND_VERSION:[0-9]+]] # Backend version +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .asciz "clang version 4.0.20211223 " # Null-terminated compiler version string +; CHECK-NOT: .short 4412 # Record kind: S_COMPILE3 +!1 = !DIFile(filename: "D:\5Csrc\5Cscopes\5Cfoo.cpp", directory: "D:\5Csrc\5Cscopes\5Cclang") +!2 = !{} +!7 = !{i32 2, !"CodeView", i32 1} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{!"clang version 4.0.20211223 "} +!10 = distinct !DISubprogram(name: "foo", linkageName: "\01?foo@@YAHXZ", scope: !1, file: !1, line: 1, type: !11, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2) +!11 = !DISubroutineType(types: !12) +!12 = !{!13} +!13 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!14 = !DILocation(line: 2, scope: !10) + +; CODEVIEW: S_COMPILE3 [size = 56] +; CODEVIEW-NEXT: machine = intel pentium 3, Ver = clang version 4.0.20211223 , language = c++ +; The backend version is based on the compiler version building this, which is variable +; CODEVIEW-NEXT: frontend = 4.0.65535.0, backend = diff --git a/llvm/test/MC/Hexagon/arch-support.s b/llvm/test/MC/Hexagon/arch-support.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Hexagon/arch-support.s @@ -0,0 +1,30 @@ +# RUN: llvm-mc -arch=hexagon -mv5 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V5 %s +# RUN: llvm-mc -arch=hexagon -mv55 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V55 %s +# RUN: llvm-mc -arch=hexagon -mv60 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V60 %s +# RUN: llvm-mc -arch=hexagon -mv62 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V62 %s +# RUN: llvm-mc -arch=hexagon -mv65 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V65 %s +# RUN: llvm-mc -arch=hexagon -mv67 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V67 %s +# RUN: llvm-mc -arch=hexagon -mv68 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V68 %s +# RUN: llvm-mc -arch=hexagon -mv69 -filetype=obj %s | llvm-readelf -h - | FileCheck --check-prefix=CHECK-V69 %s + +# RUN: llvm-mc -arch=hexagon -mv5 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s +# RUN: llvm-mc -arch=hexagon -mv55 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s +# RUN: llvm-mc -arch=hexagon -mv60 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s +# RUN: llvm-mc -arch=hexagon -mv62 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s +# RUN: llvm-mc -arch=hexagon -mv65 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s +# RUN: llvm-mc -arch=hexagon -mv67 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s +# RUN: llvm-mc -arch=hexagon 
-mv68 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s +# RUN: llvm-mc -arch=hexagon -mv69 -filetype=obj %s | llvm-objdump --disassemble - | FileCheck --check-prefix=CHECK-OBJDUMP %s + .text +r1 = r1 + +# CHECK-V5: Flags:{{.*}}0x4 +# CHECK-V55: Flags:{{.*}}0x5 +# CHECK-V60: Flags:{{.*}}0x60 +# CHECK-V62: Flags:{{.*}}0x62 +# CHECK-V65: Flags:{{.*}}0x65 +# CHECK-V67: Flags:{{.*}}0x67 +# CHECK-V68: Flags:{{.*}}0x68 +# CHECK-V69: Flags:{{.*}}0x69 + +# CHECK-OBJDUMP: { r1 = r1 } diff --git a/llvm/test/MC/Hexagon/zreg-post-inc.s b/llvm/test/MC/Hexagon/zreg-post-inc.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Hexagon/zreg-post-inc.s @@ -0,0 +1,8 @@ +# RUN: not llvm-mc -arch=hexagon -filetype=obj -mhvx -mcpu=hexagonv66 %s 2> %t; FileCheck --implicit-check-not=error %s <%t + +{ + if (p0) memb(r14+#8)=r4.new + if (p0) z=vmem(r4++#0) +} + +# CHECK: error: Instruction does not have a valid new register producer diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll --- a/llvm/test/Transforms/Attributor/heap_to_stack.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals ; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=9 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=9 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=14 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM @@ -34,7 +34,7 @@ ; IS________OPM-LABEL: define {{[^@]+}}@h2s_value_simplify_interaction ; IS________OPM-SAME: (i1 [[C:%.*]], i8* nocapture nofree readnone [[A:%.*]]) { ; IS________OPM-NEXT: entry: -; IS________OPM-NEXT: [[M:%.*]] = tail call noalias i8* @malloc(i64 noundef 4) +; IS________OPM-NEXT: [[M:%.*]] = tail call noalias align 16 i8* @malloc(i64 noundef 4) ; IS________OPM-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS________OPM: t: ; IS________OPM-NEXT: br i1 false, label [[DEAD:%.*]], label [[F2:%.*]] @@ -43,41 +43,41 @@ ; IS________OPM: f2: ; IS________OPM-NEXT: [[C1:%.*]] = bitcast i8* [[M]] to i32* ; IS________OPM-NEXT: [[C2:%.*]] = bitcast i32* [[C1]] to i8* -; IS________OPM-NEXT: [[L:%.*]] = load i8, i8* [[C2]], align 1 +; IS________OPM-NEXT: [[L:%.*]] = load i8, i8* [[C2]], align 16 ; 
IS________OPM-NEXT: call void @usei8(i8 [[L]]) -; IS________OPM-NEXT: call void @no_sync_func(i8* nocapture nofree noundef [[C2]]) #[[ATTR5:[0-9]+]] +; IS________OPM-NEXT: call void @no_sync_func(i8* nocapture nofree noundef align 16 [[C2]]) #[[ATTR5:[0-9]+]] ; IS________OPM-NEXT: br label [[J]] ; IS________OPM: dead: ; IS________OPM-NEXT: unreachable ; IS________OPM: j: ; IS________OPM-NEXT: [[PHI:%.*]] = phi i8* [ [[M]], [[F]] ], [ null, [[F2]] ] -; IS________OPM-NEXT: tail call void @no_sync_func(i8* nocapture nofree noundef [[PHI]]) #[[ATTR5]] +; IS________OPM-NEXT: tail call void @no_sync_func(i8* nocapture nofree noundef align 16 [[PHI]]) #[[ATTR5]] ; IS________OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@h2s_value_simplify_interaction ; IS________NPM-SAME: (i1 [[C:%.*]], i8* nocapture nofree readnone [[A:%.*]]) { ; IS________NPM-NEXT: entry: -; IS________NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS________NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 16 ; IS________NPM-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS________NPM: t: ; IS________NPM-NEXT: br i1 false, label [[DEAD:%.*]], label [[F2:%.*]] ; IS________NPM: f: ; IS________NPM-NEXT: br label [[J:%.*]] ; IS________NPM: f2: -; IS________NPM-NEXT: [[L:%.*]] = load i8, i8* [[TMP0]], align 1 +; IS________NPM-NEXT: [[L:%.*]] = load i8, i8* [[TMP0]], align 16 ; IS________NPM-NEXT: call void @usei8(i8 [[L]]) -; IS________NPM-NEXT: call void @no_sync_func(i8* nocapture nofree noundef [[TMP0]]) #[[ATTR6:[0-9]+]] +; IS________NPM-NEXT: call void @no_sync_func(i8* nocapture nofree noundef align 16 [[TMP0]]) #[[ATTR6:[0-9]+]] ; IS________NPM-NEXT: br label [[J]] ; IS________NPM: dead: ; IS________NPM-NEXT: unreachable ; IS________NPM: j: ; IS________NPM-NEXT: [[PHI:%.*]] = phi i8* [ [[TMP0]], [[F]] ], [ null, [[F2]] ] -; IS________NPM-NEXT: tail call void @no_sync_func(i8* nocapture nofree noundef [[PHI]]) #[[ATTR6]] +; IS________NPM-NEXT: tail call void @no_sync_func(i8* nocapture nofree noundef align 16 [[PHI]]) #[[ATTR6]] ; IS________NPM-NEXT: ret void ; entry: %add = add i64 2, 2 - %m = tail call noalias i8* @malloc(i64 %add) + %m = tail call align 16 noalias i8* @malloc(i64 %add) br i1 %c, label %t, label %f t: br i1 false, label %dead, label %f2 @@ -626,7 +626,6 @@ ; IS________OPM: 8: ; IS________OPM-NEXT: [[TMP9:%.*]] = call noalias i8* @malloc(i64 noundef 4) ; IS________OPM-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -; IS________OPM-NEXT: store i32 1, i32* [[TMP10]], align 8 ; IS________OPM-NEXT: br label [[TMP4]] ; IS________OPM: 11: ; IS________OPM-NEXT: ret i32 5 @@ -646,7 +645,6 @@ ; IS________NPM: 8: ; IS________NPM-NEXT: [[TMP9:%.*]] = alloca i8, i64 4, align 1 ; IS________NPM-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -; IS________NPM-NEXT: store i32 1, i32* [[TMP10]], align 8 ; IS________NPM-NEXT: br label [[TMP4]] ; IS________NPM: 11: ; IS________NPM-NEXT: ret i32 5 diff --git a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll --- a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll @@ -494,7 +494,6 @@ ; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call noalias i8* @malloc(i64 noundef 4) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -; CHECK-NEXT: store i32 1, i32* [[TMP10]], align 8 ; CHECK-NEXT: br label [[TMP4]] ; CHECK: 11: ; CHECK-NEXT: ret i32 5 diff --git a/llvm/test/Transforms/Attributor/memory_locations.ll 
b/llvm/test/Transforms/Attributor/memory_locations.ll --- a/llvm/test/Transforms/Attributor/memory_locations.ll +++ b/llvm/test/Transforms/Attributor/memory_locations.ll @@ -122,11 +122,18 @@ define dso_local i8* @internal_only_rec_static_helper_malloc_noescape(i32 %arg) { ; FIXME: This is actually inaccessiblememonly because the malloced memory does not escape -; CHECK-LABEL: define {{[^@]+}}@internal_only_rec_static_helper_malloc_noescape -; CHECK-SAME: (i32 [[ARG:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = call noalias i8* @internal_only_rec_static_malloc_noescape(i32 [[ARG]]) -; CHECK-NEXT: ret i8* [[CALL]] +; IS__TUNIT____-LABEL: define {{[^@]+}}@internal_only_rec_static_helper_malloc_noescape +; IS__TUNIT____-SAME: (i32 [[ARG:%.*]]) { +; IS__TUNIT____-NEXT: entry: +; IS__TUNIT____-NEXT: [[CALL:%.*]] = call noalias i8* @internal_only_rec_static_malloc_noescape(i32 [[ARG]]) +; IS__TUNIT____-NEXT: ret i8* [[CALL]] +; +; IS__CGSCC____: Function Attrs: inaccessiblememonly +; IS__CGSCC____-LABEL: define {{[^@]+}}@internal_only_rec_static_helper_malloc_noescape +; IS__CGSCC____-SAME: (i32 [[ARG:%.*]]) #[[ATTR0]] { +; IS__CGSCC____-NEXT: entry: +; IS__CGSCC____-NEXT: [[CALL:%.*]] = call noalias i8* @internal_only_rec_static_malloc_noescape(i32 [[ARG]]) +; IS__CGSCC____-NEXT: ret i8* [[CALL]] ; entry: %call = call i8* @internal_only_rec_static_malloc_noescape(i32 %arg) @@ -135,24 +142,42 @@ define internal i8* @internal_only_rec_static_malloc_noescape(i32 %arg) { ; FIXME: This is actually inaccessiblememonly because the malloced memory does not escape -; CHECK-LABEL: define {{[^@]+}}@internal_only_rec_static_malloc_noescape -; CHECK-SAME: (i32 [[ARG:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[ARG]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[REM]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[ARG]], 2 -; CHECK-NEXT: [[CALL:%.*]] = call noalias i8* @internal_only_rec(i32 [[DIV]]) -; CHECK-NEXT: br label [[RETURN:%.*]] -; CHECK: if.end: -; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[ARG]] to i64 -; CHECK-NEXT: [[CALL1:%.*]] = call noalias i8* @malloc(i64 [[CONV]]) -; CHECK-NEXT: store i8 0, i8* [[CALL1]], align 1 -; CHECK-NEXT: br label [[RETURN]] -; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i8* [ [[CALL]], [[IF_THEN]] ], [ null, [[IF_END]] ] -; CHECK-NEXT: ret i8* [[RETVAL_0]] +; IS__TUNIT____-LABEL: define {{[^@]+}}@internal_only_rec_static_malloc_noescape +; IS__TUNIT____-SAME: (i32 [[ARG:%.*]]) { +; IS__TUNIT____-NEXT: entry: +; IS__TUNIT____-NEXT: [[REM:%.*]] = srem i32 [[ARG]], 2 +; IS__TUNIT____-NEXT: [[CMP:%.*]] = icmp eq i32 [[REM]], 1 +; IS__TUNIT____-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; IS__TUNIT____: if.then: +; IS__TUNIT____-NEXT: [[DIV:%.*]] = sdiv i32 [[ARG]], 2 +; IS__TUNIT____-NEXT: [[CALL:%.*]] = call noalias i8* @internal_only_rec(i32 [[DIV]]) +; IS__TUNIT____-NEXT: br label [[RETURN:%.*]] +; IS__TUNIT____: if.end: +; IS__TUNIT____-NEXT: [[CONV:%.*]] = sext i32 [[ARG]] to i64 +; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call noalias i8* @malloc(i64 [[CONV]]) +; IS__TUNIT____-NEXT: br label [[RETURN]] +; IS__TUNIT____: return: +; IS__TUNIT____-NEXT: [[RETVAL_0:%.*]] = phi i8* [ [[CALL]], [[IF_THEN]] ], [ null, [[IF_END]] ] +; IS__TUNIT____-NEXT: ret i8* [[RETVAL_0]] +; +; IS__CGSCC____: Function Attrs: inaccessiblememonly +; IS__CGSCC____-LABEL: define {{[^@]+}}@internal_only_rec_static_malloc_noescape +; 
IS__CGSCC____-SAME: (i32 [[ARG:%.*]]) #[[ATTR0]] { +; IS__CGSCC____-NEXT: entry: +; IS__CGSCC____-NEXT: [[REM:%.*]] = srem i32 [[ARG]], 2 +; IS__CGSCC____-NEXT: [[CMP:%.*]] = icmp eq i32 [[REM]], 1 +; IS__CGSCC____-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; IS__CGSCC____: if.then: +; IS__CGSCC____-NEXT: [[DIV:%.*]] = sdiv i32 [[ARG]], 2 +; IS__CGSCC____-NEXT: [[CALL:%.*]] = call noalias i8* @internal_only_rec(i32 [[DIV]]) +; IS__CGSCC____-NEXT: br label [[RETURN:%.*]] +; IS__CGSCC____: if.end: +; IS__CGSCC____-NEXT: [[CONV:%.*]] = sext i32 [[ARG]] to i64 +; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call noalias i8* @malloc(i64 [[CONV]]) +; IS__CGSCC____-NEXT: br label [[RETURN]] +; IS__CGSCC____: return: +; IS__CGSCC____-NEXT: [[RETVAL_0:%.*]] = phi i8* [ [[CALL]], [[IF_THEN]] ], [ null, [[IF_END]] ] +; IS__CGSCC____-NEXT: ret i8* [[RETVAL_0]] ; entry: %rem = srem i32 %arg, 2 diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll --- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll @@ -54,6 +54,7 @@ ; CHECK: @[[BYTES1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef ; CHECK: @[[BYTES2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef ; CHECK: @[[REC_STORAGE:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef +; CHECK: @[[GLOBAL:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_STY:%.*]] zeroinitializer, align 8 ;. define void @write_arg(i32* %p, i32 %v) { ; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly @@ -115,7 +116,7 @@ ; IS__TUNIT_OPM-NEXT: entry: ; IS__TUNIT_OPM-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 4 ; IS__TUNIT_OPM-NEXT: [[I:%.*]] = bitcast %struct.S* [[S]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I]]) #[[ATTR9:[0-9]+]] +; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I]]) #[[ATTR10:[0-9]+]] ; IS__TUNIT_OPM-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 3 ; IS__TUNIT_OPM-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 4 ; IS__TUNIT_OPM-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 5 @@ -142,7 +143,7 @@ ; IS__TUNIT_OPM-NEXT: [[I316:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 2 ; IS__TUNIT_OPM-NEXT: store i32 [[ADD15]], i32* [[I316]], align 4, !tbaa [[TBAA14:![0-9]+]] ; IS__TUNIT_OPM-NEXT: [[I12:%.*]] = bitcast %struct.S* [[S]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I12]]) #[[ATTR9]] +; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I12]]) #[[ATTR10]] ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM: Function Attrs: argmemonly nofree nosync nounwind willreturn @@ -151,7 +152,7 @@ ; IS__TUNIT_NPM-NEXT: entry: ; IS__TUNIT_NPM-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 4 ; IS__TUNIT_NPM-NEXT: [[I:%.*]] = bitcast %struct.S* [[S]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I]]) #[[ATTR7:[0-9]+]] +; 
IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I]]) #[[ATTR9:[0-9]+]] ; IS__TUNIT_NPM-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 3 ; IS__TUNIT_NPM-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 4 ; IS__TUNIT_NPM-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 5 @@ -178,7 +179,7 @@ ; IS__TUNIT_NPM-NEXT: [[I316:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 2 ; IS__TUNIT_NPM-NEXT: store i32 [[ADD15]], i32* [[I316]], align 4, !tbaa [[TBAA14:![0-9]+]] ; IS__TUNIT_NPM-NEXT: [[I12:%.*]] = bitcast %struct.S* [[S]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I12]]) #[[ATTR7]] +; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I12]]) #[[ATTR9]] ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM: Function Attrs: argmemonly nofree nosync nounwind willreturn @@ -187,16 +188,16 @@ ; IS__CGSCC_OPM-NEXT: entry: ; IS__CGSCC_OPM-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 4 ; IS__CGSCC_OPM-NEXT: [[I:%.*]] = bitcast %struct.S* [[S]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I]]) #[[ATTR11:[0-9]+]] +; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I]]) #[[ATTR13:[0-9]+]] ; IS__CGSCC_OPM-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 3 ; IS__CGSCC_OPM-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 4 ; IS__CGSCC_OPM-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 5 ; IS__CGSCC_OPM-NEXT: [[I1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 0 -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(24) [[I1]], i32 noundef 1) #[[ATTR12:[0-9]+]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(24) [[I1]], i32 noundef 1) #[[ATTR14:[0-9]+]] ; IS__CGSCC_OPM-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 1 -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR12]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR14]] ; IS__CGSCC_OPM-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 2 -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR12]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR14]] ; IS__CGSCC_OPM-NEXT: [[F12:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 3 ; IS__CGSCC_OPM-NEXT: store float 0x3FF19999A0000000, float* [[F12]], align 4, !tbaa [[TBAA7:![0-9]+]] ; IS__CGSCC_OPM-NEXT: [[MUL:%.*]] = fmul float 0x40019999A0000000, 
2.000000e+00 @@ -214,7 +215,7 @@ ; IS__CGSCC_OPM-NEXT: [[I316:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 2 ; IS__CGSCC_OPM-NEXT: store i32 [[ADD15]], i32* [[I316]], align 4, !tbaa [[TBAA14:![0-9]+]] ; IS__CGSCC_OPM-NEXT: [[I12:%.*]] = bitcast %struct.S* [[S]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I12]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I12]]) #[[ATTR13]] ; IS__CGSCC_OPM-NEXT: ret void ; ; IS__CGSCC_NPM: Function Attrs: argmemonly nofree nosync nounwind willreturn @@ -223,16 +224,16 @@ ; IS__CGSCC_NPM-NEXT: entry: ; IS__CGSCC_NPM-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 4 ; IS__CGSCC_NPM-NEXT: [[I:%.*]] = bitcast %struct.S* [[S]] to i8* -; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I]]) #[[ATTR9:[0-9]+]] +; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I]]) #[[ATTR12:[0-9]+]] ; IS__CGSCC_NPM-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 3 ; IS__CGSCC_NPM-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 4 ; IS__CGSCC_NPM-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 5 ; IS__CGSCC_NPM-NEXT: [[I1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 0 -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(24) [[I1]], i32 noundef 1) #[[ATTR10:[0-9]+]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(24) [[I1]], i32 noundef 1) #[[ATTR13:[0-9]+]] ; IS__CGSCC_NPM-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 1 -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR10]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR13]] ; IS__CGSCC_NPM-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 2 -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR10]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR13]] ; IS__CGSCC_NPM-NEXT: [[F12:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 3 ; IS__CGSCC_NPM-NEXT: store float 0x3FF19999A0000000, float* [[F12]], align 4, !tbaa [[TBAA7:![0-9]+]] ; IS__CGSCC_NPM-NEXT: [[MUL:%.*]] = fmul float 0x40019999A0000000, 2.000000e+00 @@ -250,7 +251,7 @@ ; IS__CGSCC_NPM-NEXT: [[I316:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 2 ; IS__CGSCC_NPM-NEXT: store i32 [[ADD15]], i32* [[I316]], align 4, !tbaa [[TBAA14:![0-9]+]] ; IS__CGSCC_NPM-NEXT: [[I12:%.*]] = bitcast %struct.S* [[S]] to i8* -; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I12]]) 
#[[ATTR9]] +; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 24, i8* nocapture nofree noundef nonnull align 4 dereferenceable(24) [[I12]]) #[[ATTR12]] ; IS__CGSCC_NPM-NEXT: ret void ; entry: @@ -411,7 +412,7 @@ ; IS__TUNIT_NPM-NEXT: entry: ; IS__TUNIT_NPM-NEXT: [[BYTES:%.*]] = alloca [1024 x i8], align 16 ; IS__TUNIT_NPM-NEXT: [[I:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 0 -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 1024, i8* nocapture nofree noundef nonnull align 16 dereferenceable(1024) [[I]]) #[[ATTR7]] +; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 1024, i8* nocapture nofree noundef nonnull align 16 dereferenceable(1024) [[I]]) #[[ATTR9]] ; IS__TUNIT_NPM-NEXT: br label [[FOR_COND:%.*]] ; IS__TUNIT_NPM: for.cond: ; IS__TUNIT_NPM-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] @@ -481,7 +482,7 @@ ; IS__TUNIT_NPM-NEXT: br label [[FOR_COND28]], !llvm.loop [[LOOP20:![0-9]+]] ; IS__TUNIT_NPM: for.end38: ; IS__TUNIT_NPM-NEXT: [[I24:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 0 -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 1024, i8* nocapture nofree noundef nonnull align 16 dereferenceable(1024) [[I24]]) #[[ATTR7]] +; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 1024, i8* nocapture nofree noundef nonnull align 16 dereferenceable(1024) [[I24]]) #[[ATTR9]] ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@local_alloca_simplifiable_2() { @@ -545,7 +546,7 @@ ; IS__CGSCC_OPM-NEXT: store i8 0, i8* [[ARRAYIDX25]], align 1, !tbaa [[TBAA15]] ; IS__CGSCC_OPM-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 500 ; IS__CGSCC_OPM-NEXT: [[I22:%.*]] = bitcast i8* [[ARRAYIDX26]] to i32* -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[I22]], i32 noundef 0) #[[ATTR13:[0-9]+]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[I22]], i32 noundef 0) #[[ATTR15:[0-9]+]] ; IS__CGSCC_OPM-NEXT: br label [[FOR_COND28:%.*]] ; IS__CGSCC_OPM: for.cond28: ; IS__CGSCC_OPM-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC36:%.*]] ], [ 0, [[FOR_END24]] ] @@ -571,7 +572,7 @@ ; IS__CGSCC_NPM-NEXT: entry: ; IS__CGSCC_NPM-NEXT: [[BYTES:%.*]] = alloca [1024 x i8], align 16 ; IS__CGSCC_NPM-NEXT: [[I:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 0 -; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 1024, i8* nocapture nofree noundef nonnull align 16 dereferenceable(1024) [[I]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 1024, i8* nocapture nofree noundef nonnull align 16 dereferenceable(1024) [[I]]) #[[ATTR12]] ; IS__CGSCC_NPM-NEXT: br label [[FOR_COND:%.*]] ; IS__CGSCC_NPM: for.cond: ; IS__CGSCC_NPM-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] @@ -628,7 +629,7 @@ ; IS__CGSCC_NPM-NEXT: store i8 0, i8* [[ARRAYIDX25]], align 1, !tbaa [[TBAA15]] ; IS__CGSCC_NPM-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BYTES]], i64 0, i64 500 ; IS__CGSCC_NPM-NEXT: [[I22:%.*]] = bitcast i8* [[ARRAYIDX26]] to i32* -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) 
[[I22]], i32 noundef 0) #[[ATTR11:[0-9]+]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[I22]], i32 noundef 0) #[[ATTR14:[0-9]+]] ; IS__CGSCC_NPM-NEXT: br label [[FOR_COND28:%.*]] ; IS__CGSCC_NPM: for.cond28: ; IS__CGSCC_NPM-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC36:%.*]] ], [ 0, [[FOR_END24]] ] @@ -825,7 +826,7 @@ ; IS__TUNIT_OPM-NEXT: entry: ; IS__TUNIT_OPM-NEXT: [[L:%.*]] = alloca i32, align 4 ; IS__TUNIT_OPM-NEXT: [[I:%.*]] = bitcast i32* [[L]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR9]] +; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR10]] ; IS__TUNIT_OPM-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 ; IS__TUNIT_OPM-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] ; IS__TUNIT_OPM: cond.true: @@ -834,7 +835,7 @@ ; IS__TUNIT_OPM-NEXT: br label [[COND_END]] ; IS__TUNIT_OPM: cond.end: ; IS__TUNIT_OPM-NEXT: [[I2:%.*]] = bitcast i32* [[L]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I2]]) #[[ATTR9]] +; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I2]]) #[[ATTR10]] ; IS__TUNIT_OPM-NEXT: ret i32 5 ; ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind willreturn @@ -843,7 +844,7 @@ ; IS__TUNIT_NPM-NEXT: entry: ; IS__TUNIT_NPM-NEXT: [[L:%.*]] = alloca i32, align 4 ; IS__TUNIT_NPM-NEXT: [[I:%.*]] = bitcast i32* [[L]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR7]] +; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR9]] ; IS__TUNIT_NPM-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 ; IS__TUNIT_NPM-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] ; IS__TUNIT_NPM: cond.true: @@ -852,7 +853,7 @@ ; IS__TUNIT_NPM-NEXT: br label [[COND_END]] ; IS__TUNIT_NPM: cond.end: ; IS__TUNIT_NPM-NEXT: [[I2:%.*]] = bitcast i32* [[L]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I2]]) #[[ATTR7]] +; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I2]]) #[[ATTR9]] ; IS__TUNIT_NPM-NEXT: ret i32 5 ; ; IS__CGSCC_OPM: Function Attrs: nofree nosync nounwind willreturn @@ -861,7 +862,7 @@ ; IS__CGSCC_OPM-NEXT: entry: ; IS__CGSCC_OPM-NEXT: [[L:%.*]] = alloca i32, align 4 ; IS__CGSCC_OPM-NEXT: [[I:%.*]] = bitcast i32* [[L]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR13]] ; IS__CGSCC_OPM-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 ; IS__CGSCC_OPM-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] ; IS__CGSCC_OPM: cond.true: @@ -870,7 +871,7 @@ ; 
IS__CGSCC_OPM-NEXT: br label [[COND_END]] ; IS__CGSCC_OPM: cond.end: ; IS__CGSCC_OPM-NEXT: [[I2:%.*]] = bitcast i32* [[L]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I2]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I2]]) #[[ATTR13]] ; IS__CGSCC_OPM-NEXT: ret i32 5 ; ; IS__CGSCC_NPM: Function Attrs: nofree nosync nounwind willreturn @@ -879,7 +880,7 @@ ; IS__CGSCC_NPM-NEXT: entry: ; IS__CGSCC_NPM-NEXT: [[L:%.*]] = alloca i32, align 4 ; IS__CGSCC_NPM-NEXT: [[I:%.*]] = bitcast i32* [[L]] to i8* -; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR12]] ; IS__CGSCC_NPM-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 ; IS__CGSCC_NPM-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] ; IS__CGSCC_NPM: cond.true: @@ -888,7 +889,7 @@ ; IS__CGSCC_NPM-NEXT: br label [[COND_END]] ; IS__CGSCC_NPM: cond.end: ; IS__CGSCC_NPM-NEXT: [[I2:%.*]] = bitcast i32* [[L]] to i8* -; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I2]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I2]]) #[[ATTR12]] ; IS__CGSCC_NPM-NEXT: ret i32 5 ; entry: @@ -929,7 +930,7 @@ ; IS__TUNIT_OPM-NEXT: entry: ; IS__TUNIT_OPM-NEXT: [[L:%.*]] = alloca i32, align 4 ; IS__TUNIT_OPM-NEXT: [[I:%.*]] = bitcast i32* [[L]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR9]] +; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR10]] ; IS__TUNIT_OPM-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 ; IS__TUNIT_OPM-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] ; IS__TUNIT_OPM: cond.true: @@ -938,7 +939,7 @@ ; IS__TUNIT_OPM-NEXT: br label [[COND_END]] ; IS__TUNIT_OPM: cond.end: ; IS__TUNIT_OPM-NEXT: [[I1:%.*]] = bitcast i32* [[L]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR9]] +; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR10]] ; IS__TUNIT_OPM-NEXT: ret i32 5 ; ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind willreturn @@ -947,7 +948,7 @@ ; IS__TUNIT_NPM-NEXT: entry: ; IS__TUNIT_NPM-NEXT: [[L:%.*]] = alloca i32, align 4 ; IS__TUNIT_NPM-NEXT: [[I:%.*]] = bitcast i32* [[L]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR7]] +; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR9]] ; IS__TUNIT_NPM-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 ; IS__TUNIT_NPM-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], 
label [[COND_TRUE:%.*]] ; IS__TUNIT_NPM: cond.true: @@ -956,7 +957,7 @@ ; IS__TUNIT_NPM-NEXT: br label [[COND_END]] ; IS__TUNIT_NPM: cond.end: ; IS__TUNIT_NPM-NEXT: [[I1:%.*]] = bitcast i32* [[L]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR7]] +; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR9]] ; IS__TUNIT_NPM-NEXT: ret i32 5 ; ; IS__CGSCC_OPM: Function Attrs: nofree nosync nounwind willreturn @@ -965,7 +966,7 @@ ; IS__CGSCC_OPM-NEXT: entry: ; IS__CGSCC_OPM-NEXT: [[L:%.*]] = alloca i32, align 4 ; IS__CGSCC_OPM-NEXT: [[I:%.*]] = bitcast i32* [[L]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR13]] ; IS__CGSCC_OPM-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 ; IS__CGSCC_OPM-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] ; IS__CGSCC_OPM: cond.true: @@ -974,7 +975,7 @@ ; IS__CGSCC_OPM-NEXT: br label [[COND_END]] ; IS__CGSCC_OPM: cond.end: ; IS__CGSCC_OPM-NEXT: [[I1:%.*]] = bitcast i32* [[L]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR13]] ; IS__CGSCC_OPM-NEXT: ret i32 5 ; ; IS__CGSCC_NPM: Function Attrs: nofree nosync nounwind willreturn @@ -983,7 +984,7 @@ ; IS__CGSCC_NPM-NEXT: entry: ; IS__CGSCC_NPM-NEXT: [[L:%.*]] = alloca i32, align 4 ; IS__CGSCC_NPM-NEXT: [[I:%.*]] = bitcast i32* [[L]] to i8* -; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR12]] ; IS__CGSCC_NPM-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CND]], 0 ; IS__CGSCC_NPM-NEXT: br i1 [[TOBOOL_NOT]], label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]] ; IS__CGSCC_NPM: cond.true: @@ -992,7 +993,7 @@ ; IS__CGSCC_NPM-NEXT: br label [[COND_END]] ; IS__CGSCC_NPM: cond.end: ; IS__CGSCC_NPM-NEXT: [[I1:%.*]] = bitcast i32* [[L]] to i8* -; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR12]] ; IS__CGSCC_NPM-NEXT: ret i32 5 ; entry: @@ -1090,9 +1091,9 @@ ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@static_global_simplifiable_1 ; IS__CGSCC_OPM-SAME: (%struct.S* noalias nocapture nofree nonnull writeonly sret([[STRUCT_S:%.*]]) align 4 dereferenceable(24) [[AGG_RESULT:%.*]]) #[[ATTR4:[0-9]+]] { ; IS__CGSCC_OPM-NEXT: entry: -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(24) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i32 0, i32 0), i32 noundef 1) #[[ATTR12]] -; IS__CGSCC_OPM-NEXT: call void 
@write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i64 0, i32 1), i32 noundef 2) #[[ATTR12]] -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(16) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i64 0, i32 2), i32 noundef 3) #[[ATTR12]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(24) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i32 0, i32 0), i32 noundef 1) #[[ATTR14]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i64 0, i32 1), i32 noundef 2) #[[ATTR14]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(16) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i64 0, i32 2), i32 noundef 3) #[[ATTR14]] ; IS__CGSCC_OPM-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 3 ; IS__CGSCC_OPM-NEXT: store float 0x3FF19999A0000000, float* [[F1]], align 4, !tbaa [[TBAA7]] ; IS__CGSCC_OPM-NEXT: [[MUL:%.*]] = fmul float 0x40019999A0000000, 2.000000e+00 @@ -1115,9 +1116,9 @@ ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@static_global_simplifiable_1 ; IS__CGSCC_NPM-SAME: (%struct.S* noalias nocapture nofree nonnull writeonly sret([[STRUCT_S:%.*]]) align 4 dereferenceable(24) [[AGG_RESULT:%.*]]) #[[ATTR4:[0-9]+]] { ; IS__CGSCC_NPM-NEXT: entry: -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(24) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i32 0, i32 0), i32 noundef 1) #[[ATTR10]] -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i64 0, i32 1), i32 noundef 2) #[[ATTR10]] -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(16) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i64 0, i32 2), i32 noundef 3) #[[ATTR10]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(24) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i32 0, i32 0), i32 noundef 1) #[[ATTR13]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i64 0, i32 1), i32 noundef 2) #[[ATTR13]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(16) getelementptr inbounds ([[STRUCT_S]], %struct.S* @Gs1, i64 0, i32 2), i32 noundef 3) #[[ATTR13]] ; IS__CGSCC_NPM-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 3 ; IS__CGSCC_NPM-NEXT: store float 0x3FF19999A0000000, float* [[F1]], align 4, !tbaa [[TBAA7]] ; IS__CGSCC_NPM-NEXT: [[MUL:%.*]] = fmul float 0x40019999A0000000, 2.000000e+00 @@ -1377,7 +1378,7 @@ ; IS__CGSCC_OPM-NEXT: br label [[FOR_COND13]], !llvm.loop [[LOOP26:![0-9]+]] ; IS__CGSCC_OPM: for.end23: ; IS__CGSCC_OPM-NEXT: store i8 0, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @GBytes, i64 0, i64 1023), align 1, !tbaa [[TBAA15]] -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nofree noundef nonnull 
writeonly align 4 dereferenceable(4) bitcast (i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @GBytes, i64 0, i64 500) to i32*), i32 noundef 0) #[[ATTR13]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) bitcast (i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @GBytes, i64 0, i64 500) to i32*), i32 noundef 0) #[[ATTR15]] ; IS__CGSCC_OPM-NEXT: br label [[FOR_COND25:%.*]] ; IS__CGSCC_OPM: for.cond25: ; IS__CGSCC_OPM-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC33:%.*]] ], [ 0, [[FOR_END23]] ] @@ -1450,7 +1451,7 @@ ; IS__CGSCC_NPM-NEXT: br label [[FOR_COND13]], !llvm.loop [[LOOP26:![0-9]+]] ; IS__CGSCC_NPM: for.end23: ; IS__CGSCC_NPM-NEXT: store i8 0, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @GBytes, i64 0, i64 1023), align 1, !tbaa [[TBAA15]] -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) bitcast (i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @GBytes, i64 0, i64 500) to i32*), i32 noundef 0) #[[ATTR11]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) bitcast (i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @GBytes, i64 0, i64 500) to i32*), i32 noundef 0) #[[ATTR14]] ; IS__CGSCC_NPM-NEXT: br label [[FOR_COND25:%.*]] ; IS__CGSCC_NPM: for.cond25: ; IS__CGSCC_NPM-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC33:%.*]] ], [ 0, [[FOR_END23]] ] @@ -1724,11 +1725,11 @@ ; IS__CGSCC_OPM-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 5 ; IS__CGSCC_OPM-NEXT: store float 0x400A666660000000, float* [[F3]], align 4, !tbaa [[TBAA11]] ; IS__CGSCC_OPM-NEXT: [[I1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 0 -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 8 dereferenceable(24) [[I1]], i32 noundef 1) #[[ATTR12]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 8 dereferenceable(24) [[I1]], i32 noundef 1) #[[ATTR14]] ; IS__CGSCC_OPM-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 1 -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR12]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR14]] ; IS__CGSCC_OPM-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 2 -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 8 dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR12]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 8 dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR14]] ; IS__CGSCC_OPM-NEXT: [[F11:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 3 ; IS__CGSCC_OPM-NEXT: [[I:%.*]] = load float, float* [[F11]], align 4, !tbaa [[TBAA7]] ; IS__CGSCC_OPM-NEXT: [[F12:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 3 @@ -1774,11 +1775,11 @@ ; IS__CGSCC_NPM-NEXT: [[F3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 5 ; IS__CGSCC_NPM-NEXT: store float 0x400A666660000000, float* [[F3]], align 4, !tbaa [[TBAA11]] ; IS__CGSCC_NPM-NEXT: 
[[I1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 0 -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 8 dereferenceable(24) [[I1]], i32 noundef 1) #[[ATTR10]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 8 dereferenceable(24) [[I1]], i32 noundef 1) #[[ATTR13]] ; IS__CGSCC_NPM-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 1 -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR10]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(20) [[I2]], i32 noundef 2) #[[ATTR13]] ; IS__CGSCC_NPM-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 2 -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 8 dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR10]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nocapture nofree noundef nonnull writeonly align 8 dereferenceable(16) [[I3]], i32 noundef 3) #[[ATTR13]] ; IS__CGSCC_NPM-NEXT: [[F11:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S]], i64 0, i32 3 ; IS__CGSCC_NPM-NEXT: [[I:%.*]] = load float, float* [[F11]], align 4, !tbaa [[TBAA7]] ; IS__CGSCC_NPM-NEXT: [[F12:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[AGG_RESULT]], i64 0, i32 3 @@ -2095,7 +2096,7 @@ ; IS__CGSCC_OPM-NEXT: store i8 0, i8* [[ARRAYIDX24]], align 1, !tbaa [[TBAA15]] ; IS__CGSCC_OPM-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[BYTES]], i64 500 ; IS__CGSCC_OPM-NEXT: [[I21:%.*]] = bitcast i8* [[ARRAYIDX25]] to i32* -; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[I21]], i32 noundef 0) #[[ATTR13]] +; IS__CGSCC_OPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[I21]], i32 noundef 0) #[[ATTR15]] ; IS__CGSCC_OPM-NEXT: br label [[FOR_COND27:%.*]] ; IS__CGSCC_OPM: for.cond27: ; IS__CGSCC_OPM-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC35:%.*]] ], [ 0, [[FOR_END23]] ] @@ -2174,7 +2175,7 @@ ; IS__CGSCC_NPM-NEXT: store i8 0, i8* [[ARRAYIDX24]], align 1, !tbaa [[TBAA15]] ; IS__CGSCC_NPM-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[BYTES]], i64 500 ; IS__CGSCC_NPM-NEXT: [[I21:%.*]] = bitcast i8* [[ARRAYIDX25]] to i32* -; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[I21]], i32 noundef 0) #[[ATTR11]] +; IS__CGSCC_NPM-NEXT: call void @write_arg(i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[I21]], i32 noundef 0) #[[ATTR14]] ; IS__CGSCC_NPM-NEXT: br label [[FOR_COND27:%.*]] ; IS__CGSCC_NPM: for.cond27: ; IS__CGSCC_NPM-NEXT: [[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], [[FOR_INC35:%.*]] ], [ 0, [[FOR_END23]] ] @@ -2307,9 +2308,9 @@ ; IS__TUNIT_OPM-NEXT: [[X:%.*]] = alloca i32, align 4 ; IS__TUNIT_OPM-NEXT: [[Y:%.*]] = alloca i32, align 4 ; IS__TUNIT_OPM-NEXT: [[I:%.*]] = bitcast i32* [[X]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR9]] +; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR10]] ; 
IS__TUNIT_OPM-NEXT: [[I1:%.*]] = bitcast i32* [[Y]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR9]] +; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR10]] ; IS__TUNIT_OPM-NEXT: store i32 1, i32* [[Y]], align 4, !tbaa [[TBAA3]] ; IS__TUNIT_OPM-NEXT: store i32 1, i32* [[X]], align 4, !tbaa [[TBAA3]] ; IS__TUNIT_OPM-NEXT: [[I2:%.*]] = bitcast i32* [[X]] to i8* @@ -2332,9 +2333,9 @@ ; IS__TUNIT_NPM-NEXT: [[X:%.*]] = alloca i32, align 4 ; IS__TUNIT_NPM-NEXT: [[Y:%.*]] = alloca i32, align 4 ; IS__TUNIT_NPM-NEXT: [[I:%.*]] = bitcast i32* [[X]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR7]] +; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR9]] ; IS__TUNIT_NPM-NEXT: [[I1:%.*]] = bitcast i32* [[Y]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR7]] +; IS__TUNIT_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR9]] ; IS__TUNIT_NPM-NEXT: store i32 1, i32* [[Y]], align 4, !tbaa [[TBAA3]] ; IS__TUNIT_NPM-NEXT: store i32 1, i32* [[X]], align 4, !tbaa [[TBAA3]] ; IS__TUNIT_NPM-NEXT: [[I2:%.*]] = bitcast i32* [[X]] to i8* @@ -2357,9 +2358,9 @@ ; IS__CGSCC_OPM-NEXT: [[X:%.*]] = alloca i32, align 4 ; IS__CGSCC_OPM-NEXT: [[Y:%.*]] = alloca i32, align 4 ; IS__CGSCC_OPM-NEXT: [[I:%.*]] = bitcast i32* [[X]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR13]] ; IS__CGSCC_OPM-NEXT: [[I1:%.*]] = bitcast i32* [[Y]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR13]] ; IS__CGSCC_OPM-NEXT: store i32 1, i32* [[Y]], align 4, !tbaa [[TBAA3]] ; IS__CGSCC_OPM-NEXT: store i32 1, i32* [[X]], align 4, !tbaa [[TBAA3]] ; IS__CGSCC_OPM-NEXT: [[I2:%.*]] = bitcast i32* [[X]] to i8* @@ -2382,9 +2383,9 @@ ; IS__CGSCC_NPM-NEXT: [[X:%.*]] = alloca i32, align 4 ; IS__CGSCC_NPM-NEXT: [[Y:%.*]] = alloca i32, align 4 ; IS__CGSCC_NPM-NEXT: [[I:%.*]] = bitcast i32* [[X]] to i8* -; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I]]) #[[ATTR12]] ; IS__CGSCC_NPM-NEXT: [[I1:%.*]] = bitcast i32* [[Y]] to i8* -; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* 
nocapture nofree noundef nonnull align 4 dereferenceable(4) [[I1]]) #[[ATTR12]] ; IS__CGSCC_NPM-NEXT: store i32 1, i32* [[Y]], align 4, !tbaa [[TBAA3]] ; IS__CGSCC_NPM-NEXT: store i32 1, i32* [[X]], align 4, !tbaa [[TBAA3]] ; IS__CGSCC_NPM-NEXT: [[I2:%.*]] = bitcast i32* [[X]] to i8* @@ -3359,6 +3360,852 @@ ret void } +define dso_local i32 @round_trip_malloc(i32 %x) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@round_trip_malloc +; IS__TUNIT_OPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 noundef 4) #[[ATTR11:[0-9]+]] +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__TUNIT_OPM-NEXT: store i32 [[X]], i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8* +; IS__TUNIT_OPM-NEXT: call void @free(i8* noundef [[TMP2]]) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@round_trip_malloc +; IS__TUNIT_NPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__TUNIT_NPM-NEXT: store i32 [[X]], i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: ret i32 [[TMP2]] +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@round_trip_malloc +; IS__CGSCC_OPM-SAME: (i32 [[X:%.*]]) #[[ATTR11:[0-9]+]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 noundef 4) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__CGSCC_OPM-NEXT: store i32 [[X]], i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8* +; IS__CGSCC_OPM-NEXT: call void @free(i8* noundef [[TMP2]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@round_trip_malloc +; IS__CGSCC_NPM-SAME: (i32 returned [[X:%.*]]) #[[ATTR9:[0-9]+]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__CGSCC_NPM-NEXT: store i32 [[X]], i32* [[TMP1]], align 4 +; IS__CGSCC_NPM-NEXT: ret i32 [[X]] +; +entry: + %call = call noalias i8* @malloc(i64 4) norecurse + %0 = bitcast i8* %call to i32* + store i32 %x, i32* %0, align 4 + %1 = load i32, i32* %0, align 4 + %2 = bitcast i32* %0 to i8* + call void @free(i8* %2) norecurse + ret i32 %1 +} + +define dso_local i32 @round_trip_malloc_constant() { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@round_trip_malloc_constant() { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 noundef 4) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__TUNIT_OPM-NEXT: store i32 7, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8* +; IS__TUNIT_OPM-NEXT: call void @free(i8* noundef [[TMP2]]) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@round_trip_malloc_constant() { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: ret i32 7 +; +; IS__CGSCC_OPM: 
Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@round_trip_malloc_constant +; IS__CGSCC_OPM-SAME: () #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 noundef 4) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__CGSCC_OPM-NEXT: store i32 7, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8* +; IS__CGSCC_OPM-NEXT: call void @free(i8* noundef [[TMP2]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@round_trip_malloc_constant +; IS__CGSCC_NPM-SAME: () #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: ret i32 7 +; +entry: + %call = call noalias i8* @malloc(i64 4) norecurse + %0 = bitcast i8* %call to i32* + store i32 7, i32* %0, align 4 + %1 = load i32, i32* %0, align 4 + %2 = bitcast i32* %0 to i8* + call void @free(i8* %2) norecurse + ret i32 %1 +} + +declare noalias i8* @malloc(i64) + +declare void @free(i8*) + +define dso_local i32 @conditional_malloc(i32 %x) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@conditional_malloc +; IS__TUNIT_OPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 noundef 4) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__TUNIT_OPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__TUNIT_OPM-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; IS__TUNIT_OPM: if.then: +; IS__TUNIT_OPM-NEXT: store i32 [[X]], i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: br label [[IF_END]] +; IS__TUNIT_OPM: if.end: +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@conditional_malloc +; IS__TUNIT_NPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__TUNIT_NPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__TUNIT_NPM-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; IS__TUNIT_NPM: if.then: +; IS__TUNIT_NPM-NEXT: store i32 [[X]], i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: br label [[IF_END]] +; IS__TUNIT_NPM: if.end: +; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: ret i32 [[TMP2]] +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@conditional_malloc +; IS__CGSCC_OPM-SAME: (i32 [[X:%.*]]) #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 noundef 4) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__CGSCC_OPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__CGSCC_OPM-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; IS__CGSCC_OPM: if.then: +; IS__CGSCC_OPM-NEXT: store i32 [[X]], i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: br label [[IF_END]] +; IS__CGSCC_OPM: if.end: +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@conditional_malloc +; IS__CGSCC_NPM-SAME: (i32 returned [[X:%.*]]) #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: 
+; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__CGSCC_NPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__CGSCC_NPM-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; IS__CGSCC_NPM: if.then: +; IS__CGSCC_NPM-NEXT: store i32 [[X]], i32* [[TMP1]], align 4 +; IS__CGSCC_NPM-NEXT: br label [[IF_END]] +; IS__CGSCC_NPM: if.end: +; IS__CGSCC_NPM-NEXT: ret i32 [[X]] +; +entry: + %call = call noalias i8* @malloc(i64 4) norecurse + %0 = bitcast i8* %call to i32* + %tobool = icmp ne i32 %x, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 %x, i32* %0, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %1 = load i32, i32* %0, align 4 + ret i32 %1 +} + +define dso_local i32 @round_trip_calloc(i32 %x) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@round_trip_calloc +; IS__TUNIT_OPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @calloc(i64 noundef 4, i64 noundef 1) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__TUNIT_OPM-NEXT: store i32 [[X]], i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@round_trip_calloc +; IS__TUNIT_NPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__TUNIT_NPM-NEXT: [[CALLOC_BC:%.*]] = bitcast i8* [[TMP0]] to i8* +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALLOC_BC]], i8 0, i64 4, i1 false) +; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__TUNIT_NPM-NEXT: store i32 [[X]], i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: ret i32 [[TMP2]] +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@round_trip_calloc +; IS__CGSCC_OPM-SAME: (i32 [[X:%.*]]) #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @calloc(i64 noundef 4, i64 noundef 1) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__CGSCC_OPM-NEXT: store i32 [[X]], i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@round_trip_calloc +; IS__CGSCC_NPM-SAME: (i32 [[X:%.*]]) #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__CGSCC_NPM-NEXT: [[CALLOC_BC:%.*]] = bitcast i8* [[TMP0]] to i8* +; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALLOC_BC]], i8 0, i64 4, i1 false) +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__CGSCC_NPM-NEXT: store i32 [[X]], i32* [[TMP1]], align 4 +; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +; IS__CGSCC_NPM-NEXT: ret i32 [[TMP2]] +; +entry: + %call = call noalias i8* @calloc(i64 4, i64 1) norecurse + %0 = bitcast i8* %call to i32* + store i32 %x, i32* %0, align 4 + %1 = load i32, i32* %0, align 4 + ret i32 %1 +} + +define dso_local i32 @round_trip_calloc_constant() { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@round_trip_calloc_constant() { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias i8* 
@calloc(i64 noundef 4, i64 noundef 1) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__TUNIT_OPM-NEXT: store i32 11, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@round_trip_calloc_constant() { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__TUNIT_NPM-NEXT: [[CALLOC_BC:%.*]] = bitcast i8* [[TMP0]] to i8* +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALLOC_BC]], i8 0, i64 4, i1 false) +; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__TUNIT_NPM-NEXT: store i32 11, i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: ret i32 [[TMP2]] +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@round_trip_calloc_constant +; IS__CGSCC_OPM-SAME: () #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @calloc(i64 noundef 4, i64 noundef 1) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__CGSCC_OPM-NEXT: store i32 11, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@round_trip_calloc_constant +; IS__CGSCC_NPM-SAME: () #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__CGSCC_NPM-NEXT: [[CALLOC_BC:%.*]] = bitcast i8* [[TMP0]] to i8* +; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALLOC_BC]], i8 0, i64 4, i1 false) +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__CGSCC_NPM-NEXT: store i32 11, i32* [[TMP1]], align 4 +; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +; IS__CGSCC_NPM-NEXT: ret i32 [[TMP2]] +; +entry: + %call = call noalias i8* @calloc(i64 4, i64 1) norecurse + %0 = bitcast i8* %call to i32* + store i32 11, i32* %0, align 4 + %1 = load i32, i32* %0, align 4 + ret i32 %1 +} + +declare noalias i8* @calloc(i64, i64) + +define dso_local i32 @conditional_calloc(i32 %x) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@conditional_calloc +; IS__TUNIT_OPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @calloc(i64 noundef 1, i64 noundef 4) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__TUNIT_OPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__TUNIT_OPM-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__TUNIT_OPM: if.then: +; IS__TUNIT_OPM-NEXT: store i32 [[X]], i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: br label [[IF_END]] +; IS__TUNIT_OPM: if.end: +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8* +; IS__TUNIT_OPM-NEXT: call void @free(i8* [[TMP2]]) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@conditional_calloc +; IS__TUNIT_NPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__TUNIT_NPM-NEXT: [[CALLOC_BC:%.*]] = bitcast i8* [[TMP0]] to i8* +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALLOC_BC]], i8 0, i64 4, i1 false) +; 
IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__TUNIT_NPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__TUNIT_NPM-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__TUNIT_NPM: if.then: +; IS__TUNIT_NPM-NEXT: store i32 [[X]], i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: br label [[IF_END]] +; IS__TUNIT_NPM: if.end: +; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +; IS__TUNIT_NPM-NEXT: ret i32 [[TMP2]] +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@conditional_calloc +; IS__CGSCC_OPM-SAME: (i32 [[X:%.*]]) #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @calloc(i64 noundef 1, i64 noundef 4) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__CGSCC_OPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__CGSCC_OPM-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__CGSCC_OPM: if.then: +; IS__CGSCC_OPM-NEXT: store i32 [[X]], i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: br label [[IF_END]] +; IS__CGSCC_OPM: if.end: +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8* +; IS__CGSCC_OPM-NEXT: call void @free(i8* [[TMP2]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@conditional_calloc +; IS__CGSCC_NPM-SAME: (i32 [[X:%.*]]) #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__CGSCC_NPM-NEXT: [[CALLOC_BC:%.*]] = bitcast i8* [[TMP0]] to i8* +; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALLOC_BC]], i8 0, i64 4, i1 false) +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32* +; IS__CGSCC_NPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__CGSCC_NPM-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__CGSCC_NPM: if.then: +; IS__CGSCC_NPM-NEXT: store i32 [[X]], i32* [[TMP1]], align 4 +; IS__CGSCC_NPM-NEXT: br label [[IF_END]] +; IS__CGSCC_NPM: if.end: +; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +; IS__CGSCC_NPM-NEXT: ret i32 [[TMP2]] +; +entry: + %call = call noalias i8* @calloc(i64 1, i64 4) norecurse + %0 = bitcast i8* %call to i32* + %tobool = icmp ne i32 %x, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + store i32 %x, i32* %0, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %1 = load i32, i32* %0, align 4 + %2 = bitcast i32* %0 to i8* + call void @free(i8* %2) norecurse + ret i32 %1 +} + +define dso_local i32 @conditional_calloc_zero(i1 %c) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@conditional_calloc_zero +; IS__TUNIT_OPM-SAME: (i1 [[C:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @calloc(i64 noundef 1, i64 noundef 4) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__TUNIT_OPM-NEXT: br i1 [[C]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__TUNIT_OPM: if.then: +; IS__TUNIT_OPM-NEXT: store i32 0, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: br label [[IF_END]] +; IS__TUNIT_OPM: if.end: +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8* +; IS__TUNIT_OPM-NEXT: call void @free(i8* [[TMP2]]) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP1]] +; 
+; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@conditional_calloc_zero +; IS__TUNIT_NPM-SAME: (i1 [[C:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__TUNIT_NPM-NEXT: [[CALLOC_BC:%.*]] = bitcast i8* [[TMP0]] to i8* +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALLOC_BC]], i8 0, i64 4, i1 false) +; IS__TUNIT_NPM-NEXT: br i1 [[C]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__TUNIT_NPM: if.then: +; IS__TUNIT_NPM-NEXT: br label [[IF_END]] +; IS__TUNIT_NPM: if.end: +; IS__TUNIT_NPM-NEXT: ret i32 0 +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@conditional_calloc_zero +; IS__CGSCC_OPM-SAME: (i1 [[C:%.*]]) #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @calloc(i64 noundef 1, i64 noundef 4) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__CGSCC_OPM-NEXT: br i1 [[C]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__CGSCC_OPM: if.then: +; IS__CGSCC_OPM-NEXT: store i32 0, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: br label [[IF_END]] +; IS__CGSCC_OPM: if.end: +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to i8* +; IS__CGSCC_OPM-NEXT: call void @free(i8* [[TMP2]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP1]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@conditional_calloc_zero +; IS__CGSCC_NPM-SAME: (i1 [[C:%.*]]) #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; IS__CGSCC_NPM-NEXT: [[CALLOC_BC:%.*]] = bitcast i8* [[TMP0]] to i8* +; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALLOC_BC]], i8 0, i64 4, i1 false) +; IS__CGSCC_NPM-NEXT: br i1 [[C]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__CGSCC_NPM: if.then: +; IS__CGSCC_NPM-NEXT: br label [[IF_END]] +; IS__CGSCC_NPM: if.end: +; IS__CGSCC_NPM-NEXT: ret i32 0 +; +entry: + %call = call noalias i8* @calloc(i64 1, i64 4) norecurse + %0 = bitcast i8* %call to i32* + br i1 %c, label %if.end, label %if.then + +if.then: ; preds = %entry + store i32 0, i32* %0, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %1 = load i32, i32* %0, align 4 + %2 = bitcast i32* %0 to i8* + call void @free(i8* %2) norecurse + ret i32 %1 +} + +define dso_local i32* @malloc_like(i32 %s) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@malloc_like +; IS__TUNIT_OPM-SAME: (i32 [[S:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CONV:%.*]] = sext i32 [[S]] to i64 +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 [[CONV]]) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__TUNIT_OPM-NEXT: ret i32* [[TMP0]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@malloc_like +; IS__TUNIT_NPM-SAME: (i32 [[S:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[CONV:%.*]] = sext i32 [[S]] to i64 +; IS__TUNIT_NPM-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 [[CONV]]) #[[ATTR10:[0-9]+]] +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__TUNIT_NPM-NEXT: ret i32* [[TMP0]] +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@malloc_like +; IS__CGSCC_OPM-SAME: (i32 [[S:%.*]]) #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CONV:%.*]] = sext i32 [[S]] to i64 +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias i8* 
@malloc(i64 [[CONV]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__CGSCC_OPM-NEXT: ret i32* [[TMP0]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@malloc_like +; IS__CGSCC_NPM-SAME: (i32 [[S:%.*]]) #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[CONV:%.*]] = sext i32 [[S]] to i64 +; IS__CGSCC_NPM-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 [[CONV]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; IS__CGSCC_NPM-NEXT: ret i32* [[TMP0]] +; +entry: + %conv = sext i32 %s to i64 + %call = call noalias i8* @malloc(i64 %conv) norecurse + %0 = bitcast i8* %call to i32* + ret i32* %0 +} + +define dso_local i32 @round_trip_malloc_like(i32 %x) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@round_trip_malloc_like +; IS__TUNIT_OPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call i32* @malloc_like(i32 noundef 4) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__TUNIT_OPM-NEXT: call void @free(i8* noundef [[TMP1]]) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP0]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@round_trip_malloc_like +; IS__TUNIT_NPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[CALL:%.*]] = call i32* @malloc_like(i32 noundef 4) #[[ATTR10]] +; IS__TUNIT_NPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__TUNIT_NPM-NEXT: call void @free(i8* noundef [[TMP1]]) #[[ATTR10]] +; IS__TUNIT_NPM-NEXT: ret i32 [[TMP0]] +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@round_trip_malloc_like +; IS__CGSCC_OPM-SAME: (i32 [[X:%.*]]) #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call i32* @malloc_like(i32 noundef 4) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__CGSCC_OPM-NEXT: call void @free(i8* noundef [[TMP1]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP0]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@round_trip_malloc_like +; IS__CGSCC_NPM-SAME: (i32 [[X:%.*]]) #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[CALL:%.*]] = call i32* @malloc_like(i32 noundef 4) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__CGSCC_NPM-NEXT: call void @free(i8* noundef [[TMP1]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: ret i32 [[TMP0]] +; +entry: + %call = call i32* @malloc_like(i32 4) norecurse + store i32 %x, i32* %call, align 4 + %0 = load i32, i32* %call, align 4 + %1 = bitcast i32* %call to i8* + call void @free(i8* %1) norecurse + ret i32 %0 +} + +define dso_local i32 @round_trip_unknown_alloc(i32 %x) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@round_trip_unknown_alloc +; IS__TUNIT_OPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call i32* @unknown_alloc(i32 noundef 4) 
#[[ATTR11]] +; IS__TUNIT_OPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__TUNIT_OPM-NEXT: call void @free(i8* noundef [[TMP1]]) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP0]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@round_trip_unknown_alloc +; IS__TUNIT_NPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[CALL:%.*]] = call i32* @unknown_alloc(i32 noundef 4) #[[ATTR10]] +; IS__TUNIT_NPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__TUNIT_NPM-NEXT: call void @free(i8* noundef [[TMP1]]) #[[ATTR10]] +; IS__TUNIT_NPM-NEXT: ret i32 [[TMP0]] +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@round_trip_unknown_alloc +; IS__CGSCC_OPM-SAME: (i32 [[X:%.*]]) #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call i32* @unknown_alloc(i32 noundef 4) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__CGSCC_OPM-NEXT: call void @free(i8* noundef [[TMP1]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP0]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@round_trip_unknown_alloc +; IS__CGSCC_NPM-SAME: (i32 [[X:%.*]]) #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[CALL:%.*]] = call i32* @unknown_alloc(i32 noundef 4) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__CGSCC_NPM-NEXT: call void @free(i8* noundef [[TMP1]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: ret i32 [[TMP0]] +; +entry: + %call = call i32* @unknown_alloc(i32 4) norecurse + store i32 %x, i32* %call, align 4 + %0 = load i32, i32* %call, align 4 + %1 = bitcast i32* %call to i8* + call void @free(i8* %1) norecurse + ret i32 %0 +} + +declare noalias i32* @unknown_alloc(i32) + +define dso_local i32 @conditional_unknown_alloc(i32 %x) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@conditional_unknown_alloc +; IS__TUNIT_OPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias i32* @unknown_alloc(i32 noundef 4) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__TUNIT_OPM-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__TUNIT_OPM: if.then: +; IS__TUNIT_OPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__TUNIT_OPM-NEXT: br label [[IF_END]] +; IS__TUNIT_OPM: if.end: +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__TUNIT_OPM-NEXT: call void @free(i8* [[TMP1]]) #[[ATTR11]] +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP0]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@conditional_unknown_alloc +; IS__TUNIT_NPM-SAME: (i32 [[X:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[CALL:%.*]] = call noalias i32* @unknown_alloc(i32 noundef 4) #[[ATTR10]] +; IS__TUNIT_NPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__TUNIT_NPM-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label 
[[IF_THEN:%.*]] +; IS__TUNIT_NPM: if.then: +; IS__TUNIT_NPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__TUNIT_NPM-NEXT: br label [[IF_END]] +; IS__TUNIT_NPM: if.end: +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__TUNIT_NPM-NEXT: call void @free(i8* [[TMP1]]) #[[ATTR10]] +; IS__TUNIT_NPM-NEXT: ret i32 [[TMP0]] +; +; IS__CGSCC_OPM: Function Attrs: norecurse +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@conditional_unknown_alloc +; IS__CGSCC_OPM-SAME: (i32 [[X:%.*]]) #[[ATTR11]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias i32* @unknown_alloc(i32 noundef 4) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__CGSCC_OPM-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__CGSCC_OPM: if.then: +; IS__CGSCC_OPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__CGSCC_OPM-NEXT: br label [[IF_END]] +; IS__CGSCC_OPM: if.end: +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__CGSCC_OPM-NEXT: call void @free(i8* [[TMP1]]) #[[ATTR11]] +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP0]] +; +; IS__CGSCC_NPM: Function Attrs: norecurse +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@conditional_unknown_alloc +; IS__CGSCC_NPM-SAME: (i32 [[X:%.*]]) #[[ATTR9]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[CALL:%.*]] = call noalias i32* @unknown_alloc(i32 noundef 4) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; IS__CGSCC_NPM-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; IS__CGSCC_NPM: if.then: +; IS__CGSCC_NPM-NEXT: store i32 [[X]], i32* [[CALL]], align 4 +; IS__CGSCC_NPM-NEXT: br label [[IF_END]] +; IS__CGSCC_NPM: if.end: +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = bitcast i32* [[CALL]] to i8* +; IS__CGSCC_NPM-NEXT: call void @free(i8* [[TMP1]]) #[[ATTR9]] +; IS__CGSCC_NPM-NEXT: ret i32 [[TMP0]] +; +entry: + %call = call noalias i32* @unknown_alloc(i32 4) norecurse + %tobool = icmp ne i32 %x, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + store i32 %x, i32* %call, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %0 = load i32, i32* %call, align 4 + %1 = bitcast i32* %call to i8* + call void @free(i8* %1) norecurse + ret i32 %0 +} + +%struct.STy = type { float*, double*, %struct.STy* } + +@global = internal global %struct.STy zeroinitializer, align 8 + +; We mark %dst as writeonly and %src as readonly, that is (for now) all we can expect. 
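+;
+; The pointer chain below is: %local.inner -> @global and @global.inner ->
+; the malloc'ed block, so the callee's S->inner->inner dereference reaches the
+; heap block and copies a truncated *src into *dst through it. In the
+; IS__*_NPM runs, heap-to-stack is expected to turn the malloc into a stack
+; allocation, roughly (an illustrative sketch of the expected rewrite; the
+; value name is hypothetical):
+;   %tmp = alloca i8, i64 24, align 1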
+define dso_local void @test_nested_memory(float* %dst, double* %src) { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test_nested_memory +; IS__TUNIT_OPM-SAME: (float* nocapture nofree writeonly [[DST:%.*]], double* nocapture nofree readonly [[SRC:%.*]]) { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[LOCAL:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 +; IS__TUNIT_OPM-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[LOCAL]], i64 0, i32 2 +; IS__TUNIT_OPM-NEXT: store %struct.STy* @global, %struct.STy** [[INNER]], align 8 +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call noalias dereferenceable_or_null(24) i8* @malloc(i64 noundef 24) +; IS__TUNIT_OPM-NEXT: [[DST1:%.*]] = bitcast i8* [[CALL]] to float** +; IS__TUNIT_OPM-NEXT: store float* [[DST]], float** [[DST1]], align 8 +; IS__TUNIT_OPM-NEXT: [[SRC2:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 8 +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i8* [[SRC2]] to double** +; IS__TUNIT_OPM-NEXT: store double* [[SRC]], double** [[TMP0]], align 8 +; IS__TUNIT_OPM-NEXT: store i8* [[CALL]], i8** bitcast (%struct.STy** getelementptr inbounds ([[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2) to i8**), align 8 +; IS__TUNIT_OPM-NEXT: call fastcc void @nested_memory_callee(%struct.STy* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(24) [[LOCAL]]) #[[ATTR12:[0-9]+]] +; IS__TUNIT_OPM-NEXT: ret void +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@test_nested_memory +; IS__TUNIT_NPM-SAME: (float* nocapture nofree writeonly [[DST:%.*]], double* nocapture nofree readonly [[SRC:%.*]]) { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[LOCAL:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 +; IS__TUNIT_NPM-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[LOCAL]], i64 0, i32 2 +; IS__TUNIT_NPM-NEXT: store %struct.STy* @global, %struct.STy** [[INNER]], align 8 +; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = alloca i8, i64 24, align 1 +; IS__TUNIT_NPM-NEXT: [[DST1:%.*]] = bitcast i8* [[TMP0]] to float** +; IS__TUNIT_NPM-NEXT: store float* [[DST]], float** [[DST1]], align 8 +; IS__TUNIT_NPM-NEXT: [[SRC2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8 +; IS__TUNIT_NPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC2]] to double** +; IS__TUNIT_NPM-NEXT: store double* [[SRC]], double** [[TMP1]], align 8 +; IS__TUNIT_NPM-NEXT: store i8* [[TMP0]], i8** bitcast (%struct.STy** getelementptr inbounds ([[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2) to i8**), align 8 +; IS__TUNIT_NPM-NEXT: [[LOCAL_CAST:%.*]] = bitcast %struct.STy* [[LOCAL]] to float** +; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = load float*, float** [[LOCAL_CAST]], align 8 +; IS__TUNIT_NPM-NEXT: [[LOCAL_0_1:%.*]] = getelementptr [[STRUCT_STY]], %struct.STy* [[LOCAL]], i64 0, i32 1 +; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = load double*, double** [[LOCAL_0_1]], align 8 +; IS__TUNIT_NPM-NEXT: [[LOCAL_0_2:%.*]] = getelementptr [[STRUCT_STY]], %struct.STy* [[LOCAL]], i64 0, i32 2 +; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load %struct.STy*, %struct.STy** [[LOCAL_0_2]], align 8 +; IS__TUNIT_NPM-NEXT: call fastcc void @nested_memory_callee(float* [[TMP2]], double* [[TMP3]], %struct.STy* [[TMP4]]) #[[ATTR11:[0-9]+]] +; IS__TUNIT_NPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test_nested_memory +; IS__CGSCC_OPM-SAME: (float* nocapture nofree writeonly [[DST:%.*]], double* nocapture nofree readonly [[SRC:%.*]]) { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[LOCAL:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 +; IS__CGSCC_OPM-NEXT: 
[[TMP0:%.*]] = bitcast %struct.STy* [[LOCAL]] to i8* +; IS__CGSCC_OPM-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[LOCAL]], i64 0, i32 2 +; IS__CGSCC_OPM-NEXT: store %struct.STy* @global, %struct.STy** [[INNER]], align 8 +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call noalias dereferenceable_or_null(24) i8* @malloc(i64 noundef 24) +; IS__CGSCC_OPM-NEXT: [[DST1:%.*]] = bitcast i8* [[CALL]] to float** +; IS__CGSCC_OPM-NEXT: store float* [[DST]], float** [[DST1]], align 8 +; IS__CGSCC_OPM-NEXT: [[SRC2:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 8 +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC2]] to double** +; IS__CGSCC_OPM-NEXT: store double* [[SRC]], double** [[TMP1]], align 8 +; IS__CGSCC_OPM-NEXT: store i8* [[CALL]], i8** bitcast (%struct.STy** getelementptr inbounds ([[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2) to i8**), align 8 +; IS__CGSCC_OPM-NEXT: call fastcc void @nested_memory_callee(%struct.STy* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(24) [[LOCAL]]) #[[ATTR16:[0-9]+]] +; IS__CGSCC_OPM-NEXT: ret void +; +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@test_nested_memory +; IS__CGSCC_NPM-SAME: (float* nocapture nofree writeonly [[DST:%.*]], double* nocapture nofree readonly [[SRC:%.*]]) { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[LOCAL:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 +; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast %struct.STy* [[LOCAL]] to i8* +; IS__CGSCC_NPM-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[LOCAL]], i64 0, i32 2 +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 24, align 1 +; IS__CGSCC_NPM-NEXT: [[DST1:%.*]] = bitcast i8* [[TMP1]] to float** +; IS__CGSCC_NPM-NEXT: store float* [[DST]], float** [[DST1]], align 8 +; IS__CGSCC_NPM-NEXT: [[SRC2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8 +; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC2]] to double** +; IS__CGSCC_NPM-NEXT: store double* [[SRC]], double** [[TMP2]], align 8 +; IS__CGSCC_NPM-NEXT: store i8* [[TMP1]], i8** bitcast (%struct.STy** getelementptr inbounds ([[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2) to i8**), align 8 +; IS__CGSCC_NPM-NEXT: call fastcc void @nested_memory_callee(float* noalias nocapture nofree nonnull readnone undef, double* noalias nocapture nofree nonnull readnone undef, %struct.STy* noalias nocapture nofree nonnull readnone align 8 dereferenceable(24) undef) #[[ATTR15:[0-9]+]] +; IS__CGSCC_NPM-NEXT: ret void +; +entry: + %local = alloca %struct.STy, align 8 + %0 = bitcast %struct.STy* %local to i8* + %inner = getelementptr inbounds %struct.STy, %struct.STy* %local, i64 0, i32 2 + store %struct.STy* @global, %struct.STy** %inner, align 8 + %call = call noalias dereferenceable_or_null(24) i8* @malloc(i64 24) #4 + %dst1 = bitcast i8* %call to float** + store float* %dst, float** %dst1, align 8 + %src2 = getelementptr inbounds i8, i8* %call, i64 8 + %1 = bitcast i8* %src2 to double** + store double* %src, double** %1, align 8 + store i8* %call, i8** bitcast (%struct.STy** getelementptr inbounds (%struct.STy, %struct.STy* @global, i64 0, i32 2) to i8**), align 8 + call fastcc void @nested_memory_callee(%struct.STy* nonnull %local) + ret void +} + +define internal fastcc void @nested_memory_callee(%struct.STy* nocapture readonly %S) nofree norecurse nounwind uwtable { +; IS__TUNIT_OPM: Function Attrs: nofree norecurse nosync nounwind uwtable willreturn +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@nested_memory_callee +; IS__TUNIT_OPM-SAME: 
(%struct.STy* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(24) [[S:%.*]]) #[[ATTR9:[0-9]+]] { +; IS__TUNIT_OPM-NEXT: entry: +; IS__TUNIT_OPM-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY:%.*]], %struct.STy* [[S]], i64 0, i32 2 +; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = load %struct.STy*, %struct.STy** [[INNER]], align 8 +; IS__TUNIT_OPM-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP0]], i64 0, i32 2 +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load %struct.STy*, %struct.STy** [[INNER1]], align 8 +; IS__TUNIT_OPM-NEXT: [[SRC:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP1]], i64 0, i32 1 +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = load double*, double** [[SRC]], align 8 +; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8 +; IS__TUNIT_OPM-NEXT: [[CONV:%.*]] = fptrunc double [[TMP3]] to float +; IS__TUNIT_OPM-NEXT: [[DST:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP1]], i64 0, i32 0 +; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load float*, float** [[DST]], align 8 +; IS__TUNIT_OPM-NEXT: store float [[CONV]], float* [[TMP4]], align 4 +; IS__TUNIT_OPM-NEXT: ret void +; +; IS__TUNIT_NPM: Function Attrs: nofree norecurse nosync nounwind uwtable willreturn +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@nested_memory_callee +; IS__TUNIT_NPM-SAME: (float* [[TMP0:%.*]], double* [[TMP1:%.*]], %struct.STy* [[TMP2:%.*]]) #[[ATTR7:[0-9]+]] { +; IS__TUNIT_NPM-NEXT: entry: +; IS__TUNIT_NPM-NEXT: [[S_PRIV:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 +; IS__TUNIT_NPM-NEXT: [[S_PRIV_CAST:%.*]] = bitcast %struct.STy* [[S_PRIV]] to float** +; IS__TUNIT_NPM-NEXT: store float* [[TMP0]], float** [[S_PRIV_CAST]], align 8 +; IS__TUNIT_NPM-NEXT: [[S_PRIV_0_1:%.*]] = getelementptr [[STRUCT_STY]], %struct.STy* [[S_PRIV]], i64 0, i32 1 +; IS__TUNIT_NPM-NEXT: store double* [[TMP1]], double** [[S_PRIV_0_1]], align 8 +; IS__TUNIT_NPM-NEXT: [[S_PRIV_0_2:%.*]] = getelementptr [[STRUCT_STY]], %struct.STy* [[S_PRIV]], i64 0, i32 2 +; IS__TUNIT_NPM-NEXT: store %struct.STy* [[TMP2]], %struct.STy** [[S_PRIV_0_2]], align 8 +; IS__TUNIT_NPM-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[S_PRIV]], i64 0, i32 2 +; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = load %struct.STy*, %struct.STy** [[INNER]], align 8 +; IS__TUNIT_NPM-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP3]], i64 0, i32 2 +; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load %struct.STy*, %struct.STy** [[INNER1]], align 8 +; IS__TUNIT_NPM-NEXT: [[SRC:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP4]], i64 0, i32 1 +; IS__TUNIT_NPM-NEXT: [[TMP5:%.*]] = load double*, double** [[SRC]], align 8 +; IS__TUNIT_NPM-NEXT: [[TMP6:%.*]] = load double, double* [[TMP5]], align 8 +; IS__TUNIT_NPM-NEXT: [[CONV:%.*]] = fptrunc double [[TMP6]] to float +; IS__TUNIT_NPM-NEXT: [[DST:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP4]], i64 0, i32 0 +; IS__TUNIT_NPM-NEXT: [[TMP7:%.*]] = load float*, float** [[DST]], align 8 +; IS__TUNIT_NPM-NEXT: store float [[CONV]], float* [[TMP7]], align 4 +; IS__TUNIT_NPM-NEXT: ret void +; +; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind uwtable willreturn +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@nested_memory_callee +; IS__CGSCC_OPM-SAME: (%struct.STy* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(24) [[S:%.*]]) #[[ATTR12:[0-9]+]] { +; IS__CGSCC_OPM-NEXT: entry: +; IS__CGSCC_OPM-NEXT: [[INNER:%.*]] = getelementptr inbounds 
[[STRUCT_STY:%.*]], %struct.STy* [[S]], i64 0, i32 2 +; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = load %struct.STy*, %struct.STy** [[INNER]], align 8 +; IS__CGSCC_OPM-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP0]], i64 0, i32 2 +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load %struct.STy*, %struct.STy** [[INNER1]], align 8 +; IS__CGSCC_OPM-NEXT: [[SRC:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP1]], i64 0, i32 1 +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = load double*, double** [[SRC]], align 8 +; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8 +; IS__CGSCC_OPM-NEXT: [[CONV:%.*]] = fptrunc double [[TMP3]] to float +; IS__CGSCC_OPM-NEXT: [[DST:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP1]], i64 0, i32 0 +; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load float*, float** [[DST]], align 8 +; IS__CGSCC_OPM-NEXT: store float [[CONV]], float* [[TMP4]], align 4 +; IS__CGSCC_OPM-NEXT: ret void +; +; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind uwtable willreturn +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@nested_memory_callee +; IS__CGSCC_NPM-SAME: (float* noalias nocapture nofree nonnull readnone [[TMP0:%.*]], double* noalias nocapture nofree nonnull readnone [[TMP1:%.*]], %struct.STy* noalias nocapture nofree nonnull readnone align 8 dereferenceable(24) [[TMP2:%.*]]) #[[ATTR10:[0-9]+]] { +; IS__CGSCC_NPM-NEXT: entry: +; IS__CGSCC_NPM-NEXT: [[S_PRIV:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 +; IS__CGSCC_NPM-NEXT: [[S_PRIV_CAST:%.*]] = bitcast %struct.STy* [[S_PRIV]] to float** +; IS__CGSCC_NPM-NEXT: [[S_PRIV_0_1:%.*]] = getelementptr [[STRUCT_STY]], %struct.STy* [[S_PRIV]], i64 0, i32 1 +; IS__CGSCC_NPM-NEXT: [[S_PRIV_0_2:%.*]] = getelementptr [[STRUCT_STY]], %struct.STy* [[S_PRIV]], i64 0, i32 2 +; IS__CGSCC_NPM-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[S_PRIV]], i64 0, i32 2 +; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = load %struct.STy*, %struct.STy** [[INNER]], align 8 +; IS__CGSCC_NPM-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2 +; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load %struct.STy*, %struct.STy** getelementptr inbounds ([[STRUCT_STY]], %struct.STy* @global, i64 0, i32 2), align 8 +; IS__CGSCC_NPM-NEXT: [[SRC:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP4]], i64 0, i32 1 +; IS__CGSCC_NPM-NEXT: [[TMP5:%.*]] = load double*, double** [[SRC]], align 8 +; IS__CGSCC_NPM-NEXT: [[TMP6:%.*]] = load double, double* [[TMP5]], align 8 +; IS__CGSCC_NPM-NEXT: [[CONV:%.*]] = fptrunc double [[TMP6]] to float +; IS__CGSCC_NPM-NEXT: [[DST:%.*]] = getelementptr inbounds [[STRUCT_STY]], %struct.STy* [[TMP4]], i64 0, i32 0 +; IS__CGSCC_NPM-NEXT: [[TMP7:%.*]] = load float*, float** [[DST]], align 8 +; IS__CGSCC_NPM-NEXT: store float [[CONV]], float* [[TMP7]], align 4 +; IS__CGSCC_NPM-NEXT: ret void +; +entry: + %inner = getelementptr inbounds %struct.STy, %struct.STy* %S, i64 0, i32 2 + %0 = load %struct.STy*, %struct.STy** %inner, align 8 + %inner1 = getelementptr inbounds %struct.STy, %struct.STy* %0, i64 0, i32 2 + %1 = load %struct.STy*, %struct.STy** %inner1, align 8 + %src = getelementptr inbounds %struct.STy, %struct.STy* %1, i64 0, i32 1 + %2 = load double*, double** %src, align 8 + %3 = load double, double* %2, align 8 + %conv = fptrunc double %3 to float + %dst = getelementptr inbounds %struct.STy, %struct.STy* %1, i64 0, i32 0 + %4 = load float*, float** %dst, align 8 + store float %conv, float* %4, align 4 + 
ret void +} + !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} @@ -3404,7 +4251,10 @@ ; IS__TUNIT_OPM: attributes #[[ATTR6]] = { nofree nosync nounwind writeonly } ; IS__TUNIT_OPM: attributes #[[ATTR7]] = { nofree nosync nounwind readonly willreturn } ; IS__TUNIT_OPM: attributes #[[ATTR8]] = { nofree nosync nounwind readnone } -; IS__TUNIT_OPM: attributes #[[ATTR9]] = { willreturn } +; IS__TUNIT_OPM: attributes #[[ATTR9]] = { nofree norecurse nosync nounwind uwtable willreturn } +; IS__TUNIT_OPM: attributes #[[ATTR10]] = { willreturn } +; IS__TUNIT_OPM: attributes #[[ATTR11]] = { norecurse } +; IS__TUNIT_OPM: attributes #[[ATTR12]] = { nounwind } ;. ; IS__TUNIT_NPM: attributes #[[ATTR0]] = { argmemonly nofree nosync nounwind willreturn writeonly } ; IS__TUNIT_NPM: attributes #[[ATTR1]] = { argmemonly nofree nosync nounwind willreturn } @@ -3413,7 +4263,11 @@ ; IS__TUNIT_NPM: attributes #[[ATTR4]] = { nofree nosync nounwind willreturn writeonly } ; IS__TUNIT_NPM: attributes #[[ATTR5]] = { nofree nosync nounwind readonly willreturn } ; IS__TUNIT_NPM: attributes #[[ATTR6]] = { nofree nosync nounwind writeonly } -; IS__TUNIT_NPM: attributes #[[ATTR7]] = { willreturn } +; IS__TUNIT_NPM: attributes #[[ATTR7]] = { nofree norecurse nosync nounwind uwtable willreturn } +; IS__TUNIT_NPM: attributes #[[ATTR8:[0-9]+]] = { argmemonly nofree nounwind willreturn writeonly } +; IS__TUNIT_NPM: attributes #[[ATTR9]] = { willreturn } +; IS__TUNIT_NPM: attributes #[[ATTR10]] = { norecurse } +; IS__TUNIT_NPM: attributes #[[ATTR11]] = { nounwind } ;. ; IS__CGSCC_OPM: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind willreturn writeonly } ; IS__CGSCC_OPM: attributes #[[ATTR1]] = { argmemonly nofree nosync nounwind willreturn } @@ -3426,9 +4280,12 @@ ; IS__CGSCC_OPM: attributes #[[ATTR8]] = { nofree norecurse nosync nounwind readnone } ; IS__CGSCC_OPM: attributes #[[ATTR9]] = { nofree norecurse nosync nounwind } ; IS__CGSCC_OPM: attributes #[[ATTR10]] = { nofree norecurse nosync nounwind writeonly } -; IS__CGSCC_OPM: attributes #[[ATTR11]] = { willreturn } -; IS__CGSCC_OPM: attributes #[[ATTR12]] = { nounwind willreturn writeonly } -; IS__CGSCC_OPM: attributes #[[ATTR13]] = { nounwind writeonly } +; IS__CGSCC_OPM: attributes #[[ATTR11]] = { norecurse } +; IS__CGSCC_OPM: attributes #[[ATTR12]] = { nofree norecurse nosync nounwind uwtable willreturn } +; IS__CGSCC_OPM: attributes #[[ATTR13]] = { willreturn } +; IS__CGSCC_OPM: attributes #[[ATTR14]] = { nounwind willreturn writeonly } +; IS__CGSCC_OPM: attributes #[[ATTR15]] = { nounwind writeonly } +; IS__CGSCC_OPM: attributes #[[ATTR16]] = { nounwind } ;. 
; IS__CGSCC_NPM: attributes #[[ATTR0]] = { argmemonly nofree norecurse nosync nounwind willreturn writeonly }
; IS__CGSCC_NPM: attributes #[[ATTR1]] = { argmemonly nofree nosync nounwind willreturn }
@@ -3439,9 +4296,13 @@
; IS__CGSCC_NPM: attributes #[[ATTR6]] = { argmemonly nofree norecurse nosync nounwind willreturn }
; IS__CGSCC_NPM: attributes #[[ATTR7]] = { nofree norecurse nosync nounwind readonly willreturn }
; IS__CGSCC_NPM: attributes #[[ATTR8]] = { nofree norecurse nosync nounwind writeonly }
-; IS__CGSCC_NPM: attributes #[[ATTR9]] = { willreturn }
-; IS__CGSCC_NPM: attributes #[[ATTR10]] = { nounwind willreturn writeonly }
-; IS__CGSCC_NPM: attributes #[[ATTR11]] = { nounwind writeonly }
+; IS__CGSCC_NPM: attributes #[[ATTR9]] = { norecurse }
+; IS__CGSCC_NPM: attributes #[[ATTR10]] = { nofree norecurse nosync nounwind uwtable willreturn }
+; IS__CGSCC_NPM: attributes #[[ATTR11:[0-9]+]] = { argmemonly nofree nounwind willreturn writeonly }
+; IS__CGSCC_NPM: attributes #[[ATTR12]] = { willreturn }
+; IS__CGSCC_NPM: attributes #[[ATTR13]] = { nounwind willreturn writeonly }
+; IS__CGSCC_NPM: attributes #[[ATTR14]] = { nounwind writeonly }
+; IS__CGSCC_NPM: attributes #[[ATTR15]] = { nounwind }
;.
; IS__TUNIT____: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; IS__TUNIT____: [[META1:![0-9]+]] = !{i32 7, !"uwtable", i32 1}
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll
@@ -0,0 +1,65 @@
+; Tests that coro-split will convert a call before coro.suspend to a musttail call
+; when the user of the coro.suspend is an icmp instruction.
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+
+define void @fakeresume1(i8*) {
+entry:
+  ret void
+}
+
+define void @f() #0 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %alloc = call i8* @malloc(i64 16) #3
+  %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc)
+
+  %save = call token @llvm.coro.save(i8* null)
+
+  %init_suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+  switch i8 %init_suspend, label %coro.end [
+    i8 0, label %await.ready
+    i8 1, label %coro.end
+  ]
+await.ready:
+  %save2 = call token @llvm.coro.save(i8* null)
+
+  call fastcc void @fakeresume1(i8* align 8 null)
+  %suspend = call i8 @llvm.coro.suspend(token %save2, i1 true)
+  %switch = icmp ult i8 %suspend, 2
+  br i1 %switch, label %cleanup, label %coro.end
+
+cleanup:
+  %free.handle = call i8* @llvm.coro.free(token %id, i8* %vFrame)
+  %.not = icmp eq i8* %free.handle, null
+  br i1 %.not, label %coro.end, label %coro.free
+
+coro.free:
+  call void @delete(i8* nonnull %free.handle) #2
+  br label %coro.end
+
+coro.end:
+  call i1 @llvm.coro.end(i8* null, i1 false)
+  ret void
+}
+
+; FIXME: The fakeresume1 call here should be a musttail call.
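+; A converted @f.resume would have to end roughly like this (an illustrative
+; sketch; the musttail contract requires the call to be immediately followed
+; by the return):
+;   musttail call fastcc void @fakeresume1(i8* align 8 null)
+;   ret void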
+; CHECK-LABEL: @f.resume(
+; CHECK-NOT: musttail call fastcc void @fakeresume1(
+
+declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #1
+declare i1 @llvm.coro.alloc(token) #2
+declare i64 @llvm.coro.size.i64() #3
+declare i8* @llvm.coro.begin(token, i8* writeonly) #2
+declare token @llvm.coro.save(i8*) #2
+declare i8* @llvm.coro.frame() #3
+declare i8 @llvm.coro.suspend(token, i1) #2
+declare i8* @llvm.coro.free(token, i8* nocapture readonly) #1
+declare i1 @llvm.coro.end(i8*, i1) #2
+declare i8* @llvm.coro.subfn.addr(i8* nocapture readonly, i8) #1
+declare i8* @malloc(i64)
+declare void @delete(i8* nonnull) #2
+
+attributes #0 = { "coroutine.presplit"="1" }
+attributes #1 = { argmemonly nounwind readonly }
+attributes #2 = { nounwind }
+attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll
@@ -0,0 +1,62 @@
+; Tests that sunk lifetime markers would not prevent the optimization
+; to convert a resuming call to a musttail call.
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+
+declare void @fakeresume1(i64* align 8)
+
+define void @g() #0 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %alloc = call i8* @malloc(i64 16) #3
+  %alloc.var = alloca i8
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %alloc.var)
+  %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc)
+
+  %save = call token @llvm.coro.save(i8* null)
+  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+
+  switch i8 %suspend, label %exit [
+    i8 0, label %await.suspend
+    i8 1, label %exit
+  ]
+await.suspend:
+  %save2 = call token @llvm.coro.save(i8* null)
+  call fastcc void @fakeresume1(i64* align 8 null)
+  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
+  switch i8 %suspend2, label %exit [
+    i8 0, label %await.ready
+    i8 1, label %exit
+  ]
+await.ready:
+  call void @consume(i8* %alloc.var)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %alloc.var)
+  br label %exit
+exit:
+  call i1 @llvm.coro.end(i8* null, i1 false)
+  ret void
+}
+
+; FIXME: The fakeresume1 here should be marked as musttail.
+; Verify that in the resume part resume call is marked with musttail.
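+; It looks like the sunk lifetime marker for %alloc.var is what currently
+; ends up between the fakeresume1 call and the return in @g.resume, so the
+; musttail contract (call immediately followed by ret) cannot be met yet.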
+; CHECK-LABEL: @g.resume(
+; CHECK-NOT: musttail call fastcc void @fakeresume1(i64* align 8 null)
+
+declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #1
+declare i1 @llvm.coro.alloc(token) #2
+declare i64 @llvm.coro.size.i64() #3
+declare i8* @llvm.coro.begin(token, i8* writeonly) #2
+declare token @llvm.coro.save(i8*) #2
+declare i8* @llvm.coro.frame() #3
+declare i8 @llvm.coro.suspend(token, i1) #2
+declare i8* @llvm.coro.free(token, i8* nocapture readonly) #1
+declare i1 @llvm.coro.end(i8*, i1) #2
+declare i8* @llvm.coro.subfn.addr(i8* nocapture readonly, i8) #1
+declare i8* @malloc(i64)
+declare void @consume(i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
+
+attributes #0 = { "coroutine.presplit"="1" }
+attributes #1 = { argmemonly nounwind readonly }
+attributes #2 = { nounwind }
+attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
@@ -0,0 +1,116 @@
+; Tests that sunk lifetime markers would not prevent the optimization
+; to convert a resuming call to a musttail call.
+; The difference between this and coro-split-musttail5.ll is that there is
+; an extra bitcast instruction in the path, which makes it harder to
+; optimize.
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+
+declare void @fakeresume1(i64* align 8)
+
+define void @g() #0 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %alloc = call i8* @malloc(i64 16) #3
+  %alloc.var = alloca i64
+  %alloca.var.i8 = bitcast i64* %alloc.var to i8*
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %alloca.var.i8)
+  %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc)
+
+  %save = call token @llvm.coro.save(i8* null)
+  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+
+  switch i8 %suspend, label %exit [
+    i8 0, label %await.suspend
+    i8 1, label %exit
+  ]
+await.suspend:
+  %save2 = call token @llvm.coro.save(i8* null)
+  call fastcc void @fakeresume1(i64* align 8 null)
+  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
+  switch i8 %suspend2, label %exit [
+    i8 0, label %await.ready
+    i8 1, label %exit
+  ]
+await.ready:
+  call void @consume(i64* %alloc.var)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %alloca.var.i8)
+  br label %exit
+exit:
+  call i1 @llvm.coro.end(i8* null, i1 false)
+  ret void
+}
+
+; FIXME: The fakeresume1 here should be marked as musttail.
+; Verify that in the resume part resume call is marked with musttail.
+; CHECK-LABEL: @g.resume(
+; CHECK-NOT: musttail call fastcc void @fakeresume1(i64* align 8 null)
+
+; It has a cleanup bb.
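+; (That is, unlike @g above, the second suspend of @f below can branch to a
+; cleanup block that frees the coroutine frame via llvm.coro.free and
+; @delete before reaching the common exit.)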
+define void @f() #0 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %alloc = call i8* @malloc(i64 16) #3
+  %alloc.var = alloca i64
+  %alloca.var.i8 = bitcast i64* %alloc.var to i8*
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %alloca.var.i8)
+  %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc)
+
+  %save = call token @llvm.coro.save(i8* null)
+  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+
+  switch i8 %suspend, label %exit [
+    i8 0, label %await.suspend
+    i8 1, label %exit
+  ]
+await.suspend:
+  %save2 = call token @llvm.coro.save(i8* null)
+  call fastcc void @fakeresume1(i64* align 8 null)
+  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
+  switch i8 %suspend2, label %exit [
+    i8 0, label %await.ready
+    i8 1, label %cleanup
+  ]
+await.ready:
+  call void @consume(i64* %alloc.var)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %alloca.var.i8)
+  br label %exit
+
+cleanup:
+  %free.handle = call i8* @llvm.coro.free(token %id, i8* %vFrame)
+  %.not = icmp eq i8* %free.handle, null
+  br i1 %.not, label %exit, label %coro.free
+
+coro.free:
+  call void @delete(i8* nonnull %free.handle) #2
+  br label %exit
+
+exit:
+  call i1 @llvm.coro.end(i8* null, i1 false)
+  ret void
+}
+
+; FIXME: The fakeresume1 here should be marked as musttail.
+; Verify that in the resume part resume call is marked with musttail.
+; CHECK-LABEL: @f.resume(
+; CHECK-NOT: musttail call fastcc void @fakeresume1(i64* align 8 null)
+
+declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #1
+declare i1 @llvm.coro.alloc(token) #2
+declare i64 @llvm.coro.size.i64() #3
+declare i8* @llvm.coro.begin(token, i8* writeonly) #2
+declare token @llvm.coro.save(i8*) #2
+declare i8* @llvm.coro.frame() #3
+declare i8 @llvm.coro.suspend(token, i1) #2
+declare i8* @llvm.coro.free(token, i8* nocapture readonly) #1
+declare i1 @llvm.coro.end(i8*, i1) #2
+declare i8* @llvm.coro.subfn.addr(i8* nocapture readonly, i8) #1
+declare i8* @malloc(i64)
+declare void @delete(i8* nonnull) #2
+declare void @consume(i64*)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
+
+attributes #0 = { "coroutine.presplit"="1" }
+attributes #1 = { argmemonly nounwind readonly }
+attributes #2 = { nounwind }
+attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
@@ -0,0 +1,118 @@
+; Tests that sunk lifetime markers would not prevent the optimization
+; to convert a resuming call to a musttail call.
+; The difference between this and coro-split-musttail5.ll and coro-split-musttail6.ll
+; is that this one contains a dead instruction generated during the transformation,
+; which makes the optimization harder.
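+; (The dead instruction appears to be the otherwise unused "%.unused"
+; getelementptr of %vFrame in the exit blocks of the functions below.)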
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s + +declare void @fakeresume1(i64* align 8) + +define void @g() #0 { +entry: + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) + %alloc = call i8* @malloc(i64 16) #3 + %alloc.var = alloca i64 + %alloca.var.i8 = bitcast i64* %alloc.var to i8* + call void @llvm.lifetime.start.p0i8(i64 1, i8* %alloca.var.i8) + %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc) + + %save = call token @llvm.coro.save(i8* null) + %suspend = call i8 @llvm.coro.suspend(token %save, i1 false) + + switch i8 %suspend, label %exit [ + i8 0, label %await.suspend + i8 1, label %exit + ] +await.suspend: + %save2 = call token @llvm.coro.save(i8* null) + call fastcc void @fakeresume1(i64* align 8 null) + %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) + switch i8 %suspend2, label %exit [ + i8 0, label %await.ready + i8 1, label %exit + ] +await.ready: + call void @consume(i64* %alloc.var) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %alloca.var.i8) + br label %exit +exit: + %.unused = getelementptr inbounds i8, i8* %vFrame, i32 0 + call i1 @llvm.coro.end(i8* null, i1 false) + ret void +} + +; FIXME: The fakeresume1 here should be marked as musttail. +; Verify that in the resume part resume call is marked with musttail. +; CHECK-LABEL: @g.resume( +; CHECK-NOT: musttail call fastcc void @fakeresume1(i64* align 8 null) + +; It has a cleanup bb. +define void @f() #0 { +entry: + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) + %alloc = call i8* @malloc(i64 16) #3 + %alloc.var = alloca i64 + %alloca.var.i8 = bitcast i64* %alloc.var to i8* + call void @llvm.lifetime.start.p0i8(i64 1, i8* %alloca.var.i8) + %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc) + + %save = call token @llvm.coro.save(i8* null) + %suspend = call i8 @llvm.coro.suspend(token %save, i1 false) + + switch i8 %suspend, label %exit [ + i8 0, label %await.suspend + i8 1, label %exit + ] +await.suspend: + %save2 = call token @llvm.coro.save(i8* null) + call fastcc void @fakeresume1(i64* align 8 null) + %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) + switch i8 %suspend2, label %exit [ + i8 0, label %await.ready + i8 1, label %cleanup + ] +await.ready: + call void @consume(i64* %alloc.var) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %alloca.var.i8) + br label %exit + +cleanup: + %free.handle = call i8* @llvm.coro.free(token %id, i8* %vFrame) + %.not = icmp eq i8* %free.handle, null + br i1 %.not, label %exit, label %coro.free + +coro.free: + call void @delete(i8* nonnull %free.handle) #2 + br label %exit + +exit: + %.unused = getelementptr inbounds i8, i8* %vFrame, i32 0 + call i1 @llvm.coro.end(i8* null, i1 false) + ret void +} + +; FIXME: The fakeresume1 here should be marked as musttail. +; Verify that in the resume part resume call is marked with musttail. 
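+; As in the files above, the desired form of the resume function would end
+; roughly like this (an illustrative sketch):
+;   musttail call fastcc void @fakeresume1(i64* align 8 null)
+;   ret void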
+; CHECK-LABEL: @f.resume( +; CHECK-NOT: musttail call fastcc void @fakeresume1(i64* align 8 null) + +declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #1 +declare i1 @llvm.coro.alloc(token) #2 +declare i64 @llvm.coro.size.i64() #3 +declare i8* @llvm.coro.begin(token, i8* writeonly) #2 +declare token @llvm.coro.save(i8*) #2 +declare i8* @llvm.coro.frame() #3 +declare i8 @llvm.coro.suspend(token, i1) #2 +declare i8* @llvm.coro.free(token, i8* nocapture readonly) #1 +declare i1 @llvm.coro.end(i8*, i1) #2 +declare i8* @llvm.coro.subfn.addr(i8* nocapture readonly, i8) #1 +declare i8* @malloc(i64) +declare void @delete(i8* nonnull) #2 +declare void @consume(i64*) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) + +attributes #0 = { "coroutine.presplit"="1" } +attributes #1 = { argmemonly nounwind readonly } +attributes #2 = { nounwind } +attributes #3 = { nounwind readnone } diff --git a/llvm/test/Transforms/IndVarSimplify/lftr-opaque-pointers.ll b/llvm/test/Transforms/IndVarSimplify/lftr-opaque-pointers.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/lftr-opaque-pointers.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -indvars -opaque-pointers < %s | FileCheck %s + +target datalayout = "n8:16:32:64" + +@data = common global [240 x i8] zeroinitializer, align 16 + +; Based on the test from lftr.ll +define void @test_zext(ptr %a) { +; CHECK-LABEL: @test_zext( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[P_0:%.*]] = phi ptr [ getelementptr inbounds ([240 x i8], ptr @data, i64 0, i64 0), [[ENTRY:%.*]] ], [ [[T3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DOT0:%.*]] = phi ptr [ [[A:%.*]], [[ENTRY]] ], [ [[T:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[T]] = getelementptr inbounds i8, ptr [[DOT0]], i64 1 +; CHECK-NEXT: [[T2:%.*]] = load i8, ptr [[DOT0]], align 1 +; CHECK-NEXT: [[T3]] = getelementptr inbounds i8, ptr [[P_0]], i64 1 +; CHECK-NEXT: store i8 [[T2]], ptr [[P_0]], align 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne ptr [[P_0]], getelementptr (i8, ptr @data, i64 239) +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %i.0 = phi i8 [ 0, %entry ], [ %t4, %loop ] + %p.0 = phi ptr [ getelementptr inbounds ([240 x i8], [240 x i8]* @data, i64 0, i64 0), %entry ], [ %t3, %loop ] + %.0 = phi ptr [ %a, %entry ], [ %t, %loop ] + %t = getelementptr inbounds i8, ptr %.0, i64 1 + %t2 = load i8, ptr %.0, align 1 + %t3 = getelementptr inbounds i8, ptr %p.0, i64 1 + store i8 %t2, ptr %p.0, align 1 + %t4 = add i8 %i.0, 1 + %t5 = icmp ult i8 %t4, -16 + br i1 %t5, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll --- a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll +++ b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll @@ -5,11 +5,10 @@ ; This call should not get inlined, because it would make the callee_not_avx ; call ABI incompatible. -; TODO: Currently gets inlined. 
define void @caller_avx() "target-features"="+avx" { ; CHECK-LABEL: define {{[^@]+}}@caller_avx ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @callee_not_avx(<4 x i64> ) +; CHECK-NEXT: call void @caller_not_avx() ; CHECK-NEXT: ret void ; call void @caller_not_avx() @@ -17,6 +16,10 @@ } define internal void @caller_not_avx() { +; CHECK-LABEL: define {{[^@]+}}@caller_not_avx() { +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @callee_not_avx(<4 x i64> ) +; CHECK-NEXT: ret void +; call i64 @callee_not_avx(<4 x i64> ) ret void } @@ -33,11 +36,10 @@ ; This call also shouldn't be inlined, as we don't know whether callee_unknown ; is ABI compatible or not. -; TODO: Currently gets inlined. define void @caller_avx2() "target-features"="+avx" { ; CHECK-LABEL: define {{[^@]+}}@caller_avx2 ; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @callee_unknown(<4 x i64> ) +; CHECK-NEXT: call void @caller_not_avx2() ; CHECK-NEXT: ret void ; call void @caller_not_avx2() @@ -45,6 +47,10 @@ } define internal void @caller_not_avx2() { +; CHECK-LABEL: define {{[^@]+}}@caller_not_avx2() { +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @callee_unknown(<4 x i64> ) +; CHECK-NEXT: ret void +; call i64 @callee_unknown(<4 x i64> ) ret void } diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -151,11 +151,25 @@ ; %t1 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 1 %t3 = getelementptr { i32, i32 }, { i32, i32 }* %y, i32 0, i32 1 - ;; seteq x, y %t4 = icmp eq i32* %t1, %t3 ret i1 %t4 } +define i1 @test10_addrspacecast({ i32, i32 }* %x, { i32, i32 } addrspace(3)* %y) { +; CHECK-LABEL: @test10_addrspacecast( +; CHECK-NEXT: [[T1:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[X:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[T3:%.*]] = getelementptr { i32, i32 }, { i32, i32 } addrspace(3)* [[Y:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[T3_C:%.*]] = addrspacecast i32 addrspace(3)* [[T3]] to i32* +; CHECK-NEXT: [[T4:%.*]] = icmp eq i32* [[T1]], [[T3_C]] +; CHECK-NEXT: ret i1 [[T4]] +; + %t1 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 1 + %t3 = getelementptr { i32, i32 }, { i32, i32 } addrspace(3)* %y, i32 0, i32 1 + %t3.c = addrspacecast i32 addrspace(3)* %t3 to i32* + %t4 = icmp eq i32* %t1, %t3.c + ret i1 %t4 +} + define i1 @test11({ i32, i32 }* %X) { ; CHECK-LABEL: @test11( ; CHECK-NEXT: [[Q:%.*]] = icmp eq { i32, i32 }* [[X:%.*]], null diff --git a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll --- a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll +++ b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll @@ -300,3 +300,28 @@ %cmp = icmp eq i32** %gepi32, %cast ret i1 %cmp } + +define void @test_zero_offset_cycle({ i64, i64 }* %arg) { +; CHECK-LABEL: @test_zero_offset_cycle( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: br i1 true, label [[LOOP]], label [[LOOP_CONT:%.*]] +; CHECK: loop.cont: +; CHECK-NEXT: br label [[LOOP]] +; +entry: + %gep = getelementptr inbounds { i64, i64 }, { i64, i64 }* %arg, i32 0, i32 1 + %gep.int = ptrtoint i64* %gep to i32 + br label %loop + +loop: + %phi = phi i32 [ %gep.int, %entry ], [ %gep.int2, %loop.cont ], [ %phi, %loop ] + %phi.ptr = inttoptr i32 %phi to i64* + %cmp = icmp eq i64* %gep, %phi.ptr + br i1 %cmp, label %loop, label %loop.cont + 
+loop.cont: + %gep.int2 = ptrtoint i64* %gep to i32 + br label %loop +} diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll --- a/llvm/test/Transforms/InstCombine/lshr.ll +++ b/llvm/test/Transforms/InstCombine/lshr.ll @@ -162,6 +162,64 @@ ret <2 x i8> %lshr } +define i8 @shl_add(i8 %x, i8 %y) { +; CHECK-LABEL: @shl_add( +; CHECK-NEXT: [[L:%.*]] = shl i8 [[X:%.*]], 2 +; CHECK-NEXT: [[A:%.*]] = add i8 [[L]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = lshr i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[R]] +; + %l = shl i8 %x, 2 + %a = add i8 %l, %y + %r = lshr i8 %a, 2 + ret i8 %r +} + +define <2 x i8> @shl_add_commute_vec(<2 x i8> %x, <2 x i8> %py) { +; CHECK-LABEL: @shl_add_commute_vec( +; CHECK-NEXT: [[Y:%.*]] = mul <2 x i8> [[PY:%.*]], [[PY]] +; CHECK-NEXT: [[L:%.*]] = shl <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[A:%.*]] = add <2 x i8> [[Y]], [[L]] +; CHECK-NEXT: [[R:%.*]] = lshr <2 x i8> [[A]], +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %y = mul <2 x i8> %py, %py ; thwart complexity-based canonicalization + %l = shl <2 x i8> %x, + %a = add <2 x i8> %y, %l + %r = lshr <2 x i8> %a, + ret <2 x i8> %r +} + +define i32 @shl_add_use1(i32 %x, i32 %y) { +; CHECK-LABEL: @shl_add_use1( +; CHECK-NEXT: [[L:%.*]] = shl i32 [[X:%.*]], 2 +; CHECK-NEXT: call void @use(i32 [[L]]) +; CHECK-NEXT: [[A:%.*]] = add i32 [[L]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = lshr i32 [[A]], 2 +; CHECK-NEXT: ret i32 [[R]] +; + %l = shl i32 %x, 2 + call void @use(i32 %l) + %a = add i32 %l, %y + %r = lshr i32 %a, 2 + ret i32 %r +} + +define i32 @shl_add_use2(i32 %x, i32 %y) { +; CHECK-LABEL: @shl_add_use2( +; CHECK-NEXT: [[L:%.*]] = shl i32 [[X:%.*]], 2 +; CHECK-NEXT: [[A:%.*]] = add i32 [[L]], [[Y:%.*]] +; CHECK-NEXT: call void @use(i32 [[A]]) +; CHECK-NEXT: [[R:%.*]] = lshr i32 [[A]], 2 +; CHECK-NEXT: ret i32 [[R]] +; + %l = shl i32 %x, 2 + %a = add i32 %l, %y + call void @use(i32 %a) + %r = lshr i32 %a, 2 + ret i32 %r +} + define i16 @bool_zext(i1 %x) { ; CHECK-LABEL: @bool_zext( ; CHECK-NEXT: [[HIBIT:%.*]] = zext i1 [[X:%.*]] to i16 diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -2129,3 +2129,15 @@ %r = call <3 x i8> @llvm.umax.v3i8(<3 x i8> %a, <3 x i8> ) ret <3 x i8> %r } + +; Issue #52884 - this would assert because of a failure to simplify. + +define i8 @smax_offset_simplify(i8 %x) { +; CHECK-LABEL: @smax_offset_simplify( +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i8 [[X:%.*]], 50 +; CHECK-NEXT: ret i8 [[TMP1]] +; + %1 = add nuw nsw i8 50, %x + %m = call i8 @llvm.smax.i8(i8 %1, i8 -124) + ret i8 %m +} diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll --- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll +++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll @@ -161,3 +161,116 @@ call void (...) 
@varargs(i8* byval(i8) %b) ret void } + +define ptr @geps_combinable(ptr %a) { +; CHECK-LABEL: @geps_combinable( +; CHECK-NEXT: [[A3:%.*]] = getelementptr { i32, { i32, i32 } }, ptr [[A:%.*]], i64 0, i32 1, i32 1 +; CHECK-NEXT: ret ptr [[A3]] +; + %a2 = getelementptr { i32, { i32, i32 } }, ptr %a, i32 0, i32 1 + %a3 = getelementptr { i32, i32 }, ptr %a2, i32 0, i32 1 + ret ptr %a3 +} + +define ptr @geps_not_combinable(ptr %a) { +; CHECK-LABEL: @geps_not_combinable( +; CHECK-NEXT: [[A2:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[A3:%.*]] = getelementptr { i32, i32 }, ptr [[A2]], i64 0, i32 1 +; CHECK-NEXT: ret ptr [[A3]] +; + %a2 = getelementptr { i32, i32 }, ptr %a, i32 0, i32 1 + %a3 = getelementptr { i32, i32 }, ptr %a2, i32 0, i32 1 + ret ptr %a3 +} + +define i1 @compare_geps_same_indices(ptr %a, ptr %b, i64 %idx) { +; CHECK-LABEL: @compare_geps_same_indices( +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret i1 [[C]] +; + %a2 = getelementptr i32, ptr %a, i64 %idx + %b2 = getelementptr i32, ptr %b, i64 %idx + %c = icmp eq ptr %a2, %b2 + ret i1 %c +} + +define i1 @compare_geps_same_indices_different_types(ptr %a, ptr %b, i64 %idx) { +; CHECK-LABEL: @compare_geps_same_indices_different_types( +; CHECK-NEXT: [[A2:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[IDX:%.*]] +; CHECK-NEXT: [[B2:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[IDX]] +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[A2]], [[B2]] +; CHECK-NEXT: ret i1 [[C]] +; + %a2 = getelementptr i32, ptr %a, i64 %idx + %b2 = getelementptr i64, ptr %b, i64 %idx + %c = icmp eq ptr %a2, %b2 + ret i1 %c +} + +define ptr @indexed_compare(ptr %A, i64 %offset) { +; CHECK-LABEL: @indexed_compare( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[BB:%.*]] +; CHECK: bb: +; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i64 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[OFFSET:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RHS_ADD]] = add nsw i64 [[RHS_IDX]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[RHS_IDX]], 100 +; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] +; CHECK: bb2: +; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[RHS_IDX]] +; CHECK-NEXT: ret ptr [[RHS_PTR]] +; +entry: + %tmp = getelementptr inbounds i32, ptr %A, i64 %offset + br label %bb + +bb: + %RHS = phi ptr [ %RHS.next, %bb ], [ %tmp, %entry ] + %LHS = getelementptr inbounds i32, ptr %A, i32 100 + %RHS.next = getelementptr inbounds i32, ptr %RHS, i64 1 + %cond = icmp ult ptr %LHS, %RHS + br i1 %cond, label %bb2, label %bb + +bb2: + ret ptr %RHS +} + +define ptr @indexed_compare_different_types(ptr %A, i64 %offset) { +; CHECK-LABEL: @indexed_compare_different_types( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET:%.*]] +; CHECK-NEXT: br label [[BB:%.*]] +; CHECK: bb: +; CHECK-NEXT: [[RHS:%.*]] = phi ptr [ [[RHS_NEXT:%.*]], [[BB]] ], [ [[TMP]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[LHS:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 100 +; CHECK-NEXT: [[RHS_NEXT]] = getelementptr inbounds i32, ptr [[RHS]], i64 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ult ptr [[LHS]], [[RHS]] +; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] +; CHECK: bb2: +; CHECK-NEXT: ret ptr [[RHS]] +; +entry: + %tmp = getelementptr inbounds i32, ptr %A, i64 %offset + br label %bb + +bb: + %RHS = phi ptr [ %RHS.next, %bb ], [ %tmp, %entry ] + %LHS = getelementptr inbounds i64, ptr %A, i32 100 + %RHS.next = getelementptr inbounds i32, ptr %RHS, i64 1 + %cond = icmp ult 
ptr %LHS, %RHS + br i1 %cond, label %bb2, label %bb + +bb2: + ret ptr %RHS +} + +define ptr addrspace(1) @gep_of_addrspace_cast(ptr %ptr) { +; CHECK-LABEL: @gep_of_addrspace_cast( +; CHECK-NEXT: [[CAST1:%.*]] = addrspacecast ptr [[PTR:%.*]] to ptr addrspace(1) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[CAST1]], i64 1 +; CHECK-NEXT: ret ptr addrspace(1) [[GEP]] +; + %cast1 = addrspacecast ptr %ptr to ptr addrspace(1) + %gep = getelementptr inbounds i32, ptr addrspace(1) %cast1, i64 1 + ret ptr addrspace(1) %gep +} diff --git a/llvm/test/Transforms/InstCombine/shuffle_select.ll b/llvm/test/Transforms/InstCombine/shuffle_select.ll --- a/llvm/test/Transforms/InstCombine/shuffle_select.ll +++ b/llvm/test/Transforms/InstCombine/shuffle_select.ll @@ -1060,15 +1060,30 @@ define <4 x i32> @or_2_vars(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @or_2_vars( ; CHECK-NEXT: [[T1:%.*]] = or <4 x i32> [[V0:%.*]], +; CHECK-NEXT: call void @use_v4i32(<4 x i32> [[T1]]) ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP1]], -; CHECK-NEXT: call void @use_v4i32(<4 x i32> [[T1]]) ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; %t1 = or <4 x i32> %v0, + call void @use_v4i32(<4 x i32> %t1) %t2 = or <4 x i32> %v1, %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> + ret <4 x i32> %t3 +} + +define <4 x i32> @or_2_vars_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: @or_2_vars_undef_mask_elt( +; CHECK-NEXT: [[T1:%.*]] = or <4 x i32> [[V0:%.*]], +; CHECK-NEXT: call void @use_v4i32(<4 x i32> [[T1]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP1]], +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %t1 = or <4 x i32> %v0, call void @use_v4i32(<4 x i32> %t1) + %t2 = or <4 x i32> %v1, + %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> ret <4 x i32> %t3 } @@ -1077,17 +1092,17 @@ define <4 x i32> @xor_2_vars(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @xor_2_vars( ; CHECK-NEXT: [[T1:%.*]] = xor <4 x i32> [[V0:%.*]], -; CHECK-NEXT: [[T2:%.*]] = xor <4 x i32> [[V1:%.*]], -; CHECK-NEXT: [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> ; CHECK-NEXT: call void @use_v4i32(<4 x i32> [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = xor <4 x i32> [[V1:%.*]], ; CHECK-NEXT: call void @use_v4i32(<4 x i32> [[T2]]) +; CHECK-NEXT: [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[T3]] ; %t1 = xor <4 x i32> %v0, - %t2 = xor <4 x i32> %v1, - %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> call void @use_v4i32(<4 x i32> %t1) + %t2 = xor <4 x i32> %v1, call void @use_v4i32(<4 x i32> %t2) + %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> ret <4 x i32> %t3 } diff --git a/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check.ll b/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check.ll --- a/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check.ll @@ -197,3 +197,159 @@ %r = icmp sge i8 %t0, %y ret i1 %r } + +define i1 @low_bitmask_ult(i8 %x) { +; CHECK-LABEL: @low_bitmask_ult( +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X:%.*]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %a = add i8 %x, 31 + %m = and i8 %a, 31 + %r = icmp ult i8 %m, %x + ret i1 %r +} + +define <2 x i1> @low_bitmask_uge(<2 x i8> %x) { +; 
CHECK-LABEL: @low_bitmask_uge( +; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[X:%.*]], zeroinitializer +; CHECK-NEXT: ret <2 x i1> [[R]] +; + %a = add <2 x i8> %x, + %m = and <2 x i8> %a, + %r = icmp uge <2 x i8> %m, %x + ret <2 x i1> %r +} + +define i1 @low_bitmask_ugt(i8 %px) { +; CHECK-LABEL: @low_bitmask_ugt( +; CHECK-NEXT: [[X:%.*]] = mul i8 [[PX:%.*]], [[PX]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %x = mul i8 %px, %px + %a = add i8 %x, 127 + %m = and i8 %a, 127 + %r = icmp ugt i8 %x, %m + ret i1 %r +} + +define <2 x i1> @low_bitmask_ule(<2 x i8> %px) { +; CHECK-LABEL: @low_bitmask_ule( +; CHECK-NEXT: [[X:%.*]] = mul <2 x i8> [[PX:%.*]], [[PX]] +; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[X]], zeroinitializer +; CHECK-NEXT: ret <2 x i1> [[R]] +; + %x = mul <2 x i8> %px, %px + %a = add <2 x i8> %x, + %m = and <2 x i8> %a, + %r = icmp ule <2 x i8> %x, %m + ret <2 x i1> %r +} + +define i1 @low_bitmask_ult_use(i8 %x) { +; CHECK-LABEL: @low_bitmask_ult_use( +; CHECK-NEXT: [[A:%.*]] = add i8 [[X:%.*]], 7 +; CHECK-NEXT: [[M:%.*]] = and i8 [[A]], 7 +; CHECK-NEXT: call void @use8(i8 [[M]]) +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %a = add i8 %x, 7 + %m = and i8 %a, 7 + call void @use8(i8 %m) + %r = icmp ult i8 %m, %x + ret i1 %r +} + +define i1 @low_bitmask_ugt_use(i8 %px) { +; CHECK-LABEL: @low_bitmask_ugt_use( +; CHECK-NEXT: [[X:%.*]] = mul i8 [[PX:%.*]], [[PX]] +; CHECK-NEXT: [[A:%.*]] = add i8 [[X]], 3 +; CHECK-NEXT: call void @use8(i8 [[A]]) +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %x = mul i8 %px, %px + %a = add i8 %x, 3 + call void @use8(i8 %a) + %m = and i8 %a, 3 + %r = icmp ugt i8 %x, %m + ret i1 %r +} + +; negative test - need same low bitmask + +define i1 @low_bitmask_ult_wrong_mask1(i8 %x) { +; CHECK-LABEL: @low_bitmask_ult_wrong_mask1( +; CHECK-NEXT: [[A:%.*]] = add i8 [[X:%.*]], 30 +; CHECK-NEXT: [[M:%.*]] = and i8 [[A]], 31 +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[M]], [[X]] +; CHECK-NEXT: ret i1 [[R]] +; + %a = add i8 %x, 30 + %m = and i8 %a, 31 + %r = icmp ult i8 %m, %x + ret i1 %r +} + +; negative test - need same low bitmask + +define i1 @low_bitmask_uge_wrong_mask2(i8 %x) { +; CHECK-LABEL: @low_bitmask_uge_wrong_mask2( +; CHECK-NEXT: [[A:%.*]] = add i8 [[X:%.*]], 31 +; CHECK-NEXT: [[M:%.*]] = and i8 [[A]], 63 +; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[M]], [[X]] +; CHECK-NEXT: ret i1 [[R]] +; + %a = add i8 %x, 31 + %m = and i8 %a, 63 + %r = icmp uge i8 %m, %x + ret i1 %r +} + +; negative test - predicate mandates operand order + +define i1 @low_bitmask_ugt_swapped(i8 %x) { +; CHECK-LABEL: @low_bitmask_ugt_swapped( +; CHECK-NEXT: [[A:%.*]] = add i8 [[X:%.*]], 127 +; CHECK-NEXT: [[M:%.*]] = and i8 [[A]], 127 +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[M]], [[X]] +; CHECK-NEXT: ret i1 [[R]] +; + %a = add i8 %x, 127 + %m = and i8 %a, 127 + %r = icmp ugt i8 %m, %x + ret i1 %r +} + +; negative test - unsigned preds only + +define i1 @low_bitmask_sgt(i8 %px) { +; CHECK-LABEL: @low_bitmask_sgt( +; CHECK-NEXT: [[X:%.*]] = mul i8 [[PX:%.*]], [[PX]] +; CHECK-NEXT: [[A:%.*]] = add i8 [[X]], 127 +; CHECK-NEXT: [[M:%.*]] = and i8 [[A]], 127 +; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[X]], [[M]] +; CHECK-NEXT: ret i1 [[R]] +; + %x = mul i8 %px, %px + %a = add i8 %x, 127 + %m = and i8 %a, 127 + %r = icmp sgt i8 %x, %m + ret i1 %r +} + +; negative test - specific operand must match + +define i1 @low_bitmask_ult_specific_op(i8 %x, i8 %y) { +; CHECK-LABEL: 
@low_bitmask_ult_specific_op( +; CHECK-NEXT: [[A:%.*]] = add i8 [[X:%.*]], 31 +; CHECK-NEXT: [[M:%.*]] = and i8 [[A]], 31 +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[M]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[R]] +; + %a = add i8 %x, 31 + %m = and i8 %a, 31 + %r = icmp ult i8 %m, %y + ret i1 %r +} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-null.ll b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll rename from llvm/test/Transforms/InstSimplify/ConstProp/icmp-null.ll rename to llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll --- a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-null.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll @@ -215,3 +215,37 @@ %cmp = icmp sgt [2 x i32]* %gep, @g ret i1 %cmp } + +define i1 @global_gep_ugt_global_gep() { +; CHECK-LABEL: @global_gep_ugt_global_gep( +; CHECK-NEXT: ret i1 true +; + %gep1 = getelementptr inbounds [2 x i32], [2 x i32]* @g, i64 0, i64 0 + %gep2 = getelementptr inbounds [2 x i32], [2 x i32]* @g, i64 0, i64 1 + %cmp = icmp ugt i32* %gep2, %gep1 + ret i1 %cmp +} + +; Should not fold due to signed comparison. +define i1 @global_gep_sgt_global_gep() { +; CHECK-LABEL: @global_gep_sgt_global_gep( +; CHECK-NEXT: ret i1 icmp sgt (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @g, i64 0, i64 1), i32* getelementptr inbounds ([2 x i32], [2 x i32]* @g, i64 0, i64 0)) +; + %gep1 = getelementptr inbounds [2 x i32], [2 x i32]* @g, i64 0, i64 0 + %gep2 = getelementptr inbounds [2 x i32], [2 x i32]* @g, i64 0, i64 1 + %cmp = icmp sgt i32* %gep2, %gep1 + ret i1 %cmp +} + +define i1 @global_gep_ugt_global_gep_complex() { +; CHECK-LABEL: @global_gep_ugt_global_gep_complex( +; CHECK-NEXT: ret i1 icmp ugt (i32* bitcast (i8* getelementptr inbounds (i8, i8* bitcast ([2 x i32]* @g to i8*), i64 2) to i32*), i32* getelementptr inbounds ([2 x i32], [2 x i32]* @g, i64 0, i64 0)) +; + %gep1 = getelementptr inbounds [2 x i32], [2 x i32]* @g, i64 0, i64 0 + %gep2 = getelementptr inbounds [2 x i32], [2 x i32]* @g, i64 0, i64 0 + %gep2.cast = bitcast i32* %gep2 to i8* + %gep3 = getelementptr inbounds i8, i8* %gep2.cast, i64 2 + %gep3.cast = bitcast i8* %gep3 to i32* + %cmp = icmp ugt i32* %gep3.cast, %gep1 + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstSimplify/icmp-constant.ll b/llvm/test/Transforms/InstSimplify/icmp-constant.ll --- a/llvm/test/Transforms/InstSimplify/icmp-constant.ll +++ b/llvm/test/Transforms/InstSimplify/icmp-constant.ll @@ -623,6 +623,72 @@ ret i1 %cmp } +define i1 @add_nsw_sgt(i8 %x) { +; CHECK-LABEL: @add_nsw_sgt( +; CHECK-NEXT: ret i1 true +; + %add = add nsw i8 %x, 5 + %cmp = icmp sgt i8 %add, -124 + ret i1 %cmp +} + +; nuw should not inhibit the fold. + +define i1 @add_nsw_nuw_sgt(i8 %x) { +; CHECK-LABEL: @add_nsw_nuw_sgt( +; CHECK-NEXT: ret i1 true +; + %add = add nsw nuw i8 %x, 5 + %cmp = icmp sgt i8 %add, -124 + ret i1 %cmp +} + +; negative test - minimum x is -128, so add could be -124. + +define i1 @add_nsw_sgt_limit(i8 %x) { +; CHECK-LABEL: @add_nsw_sgt_limit( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i8 [[X:%.*]], 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[ADD]], -124 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i8 %x, 4 + %cmp = icmp sgt i8 %add, -124 + ret i1 %cmp +} + +define i1 @add_nsw_slt(i8 %x) { +; CHECK-LABEL: @add_nsw_slt( +; CHECK-NEXT: ret i1 false +; + %add = add nsw i8 %x, 5 + %cmp = icmp slt i8 %add, -123 + ret i1 %cmp +} + +; nuw should not inhibit the fold. 
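+; (nsw alone already bounds the result: x >= -128 implies x + 5 >= -123, so the
+; strict slt -123 compare is always false; the extra nuw flag adds nothing that
+; could block this.)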
+ +define i1 @add_nsw_nuw_slt(i8 %x) { +; CHECK-LABEL: @add_nsw_nuw_slt( +; CHECK-NEXT: ret i1 false +; + %add = add nsw nuw i8 %x, 5 + %cmp = icmp slt i8 %add, -123 + ret i1 %cmp +} + +; negative test - minimum x is -128, so add could be -123. + +define i1 @add_nsw_slt_limit(i8 %x) { +; CHECK-LABEL: @add_nsw_slt_limit( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i8 [[X:%.*]], 5 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ADD]], -122 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i8 %x, 5 + %cmp = icmp slt i8 %add, -122 + ret i1 %cmp +} + ; InstCombine can fold this, but not InstSimplify. define i1 @add_nsw_neg_const2(i32 %x) { diff --git a/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll b/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll --- a/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll +++ b/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll @@ -20,8 +20,8 @@ ; CHECK: cond_next2943: ; CHECK-NEXT: br i1 false, label [[BB2982_PREHEADER:%.*]], label [[BB2928]] ; CHECK: bb2982.preheader: -; CHECK-NEXT: store i8 undef, i8* null -; CHECK-NEXT: ret i32 undef +; CHECK-NEXT: store i8 poison, i8* null +; CHECK-NEXT: ret i32 poison ; cond_next2835.1: ; preds = %cond_next2861 %tmp2922 = load i32, i32* @n_spills, align 4 ; [#uses=0] diff --git a/llvm/test/Transforms/NewGVN/assume-equal.ll b/llvm/test/Transforms/NewGVN/assume-equal.ll --- a/llvm/test/Transforms/NewGVN/assume-equal.ll +++ b/llvm/test/Transforms/NewGVN/assume-equal.ll @@ -31,7 +31,7 @@ ; CHECK: bb2: ; CHECK-NEXT: br i1 true, label [[BB2]], label [[BB2]] ; CHECK: 0: -; CHECK-NEXT: store i8 undef, i8* null, align 1 +; CHECK-NEXT: store i8 poison, i8* null, align 1 ; CHECK-NEXT: ret i32 [[P]] ; entry: @@ -82,7 +82,7 @@ ; CHECK-NEXT: call void @llvm.assume(i1 false) ; CHECK-NEXT: ret i32 15 ; CHECK: bb3: -; CHECK-NEXT: store i8 undef, i8* null, align 1 +; CHECK-NEXT: store i8 poison, i8* null, align 1 ; CHECK-NEXT: ret i32 17 ; entry: diff --git a/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll b/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll --- a/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll +++ b/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll @@ -83,10 +83,10 @@ ; CHECK: bb14: ; CHECK-NEXT: br label [[BB16]] ; CHECK: bb15: -; CHECK-NEXT: store i8 undef, i8* null, align 1 +; CHECK-NEXT: store i8 poison, i8* null, align 1 ; CHECK-NEXT: br label [[BB16]] ; CHECK: bb16: -; CHECK-NEXT: [[TMP17:%.*]] = phi i32 [ undef, [[BB15]] ], [ 1, [[BB14]] ], [ 9, [[BB7]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi i32 [ poison, [[BB15]] ], [ 1, [[BB14]] ], [ 9, [[BB7]] ] ; CHECK-NEXT: switch i32 [[TMP17]], label [[BB19]] [ ; CHECK-NEXT: i32 0, label [[BB6]] ; CHECK-NEXT: i32 9, label [[BB18:%.*]] diff --git a/llvm/test/Transforms/NewGVN/pr31594.ll b/llvm/test/Transforms/NewGVN/pr31594.ll --- a/llvm/test/Transforms/NewGVN/pr31594.ll +++ b/llvm/test/Transforms/NewGVN/pr31594.ll @@ -77,7 +77,7 @@ ; CHECK-NEXT: i8 6, label [[BB8:%.*]] ; CHECK-NEXT: ] ; CHECK: bb8: -; CHECK-NEXT: store i8 undef, i8* null, align 1 +; CHECK-NEXT: store i8 poison, i8* null, align 1 ; CHECK-NEXT: br label [[BB4]] ; CHECK: bb9: ; CHECK-NEXT: store i8 0, i8* [[ARG]], align 1, !g !0 diff --git a/llvm/test/Transforms/NewGVN/pr31758.ll b/llvm/test/Transforms/NewGVN/pr31758.ll --- a/llvm/test/Transforms/NewGVN/pr31758.ll +++ b/llvm/test/Transforms/NewGVN/pr31758.ll @@ -12,7 +12,7 @@ ; CHECK: bb90: ; 
CHECK-NEXT: br label [[BB90]] ; CHECK: bb138: -; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: store i8 poison, i8* null ; CHECK-NEXT: br label [[BB138:%.*]] ; bb: diff --git a/llvm/test/Transforms/NewGVN/pr32607.ll b/llvm/test/Transforms/NewGVN/pr32607.ll --- a/llvm/test/Transforms/NewGVN/pr32607.ll +++ b/llvm/test/Transforms/NewGVN/pr32607.ll @@ -5,11 +5,9 @@ ; CHECK-NEXT: top: ; CHECK-NEXT: br label [[IF:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[TMP1:%.*]], [[IF]] ], [ undef, [[TOP:%.*]] ] -; CHECK-NEXT: [[TMP1]] = fadd double [[TMP0]], 1.000000e+00 ; CHECK-NEXT: br i1 false, label [[L50:%.*]], label [[IF]] ; CHECK: L50: -; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: store i8 poison, i8* null ; CHECK-NEXT: ret void ; top: diff --git a/llvm/test/Transforms/NewGVN/pr32838.ll b/llvm/test/Transforms/NewGVN/pr32838.ll --- a/llvm/test/Transforms/NewGVN/pr32838.ll +++ b/llvm/test/Transforms/NewGVN/pr32838.ll @@ -54,7 +54,7 @@ ; CHECK: if.then: ; CHECK-NEXT: br i1 false, label [[FIRSTPHIBLOCK:%.*]], label [[TEMP:%.*]] ; CHECK: firstphiblock: -; CHECK-NEXT: [[FIRSTPHI:%.*]] = phi i64 [ undef, [[IF_THEN]] ], [ [[SECONDPHI:%.*]], [[SECONDPHIBLOCK:%.*]] ] +; CHECK-NEXT: [[FIRSTPHI:%.*]] = phi i64 [ poison, [[IF_THEN]] ], [ [[SECONDPHI:%.*]], [[SECONDPHIBLOCK:%.*]] ] ; CHECK-NEXT: br i1 undef, label %for.cond17thread-pre-split, label [[SECONDPHIBLOCK]] ; CHECK: secondphiblock: ; CHECK-NEXT: [[SECONDPHI]] = phi i64 [ [[THIRDPHI:%.*]], [[THIRDPHIBLOCK:%.*]] ], [ [[FIRSTPHI]], [[FIRSTPHIBLOCK]] ] @@ -105,7 +105,7 @@ ; CHECK-NEXT: [[F_0:%.*]] = phi i32* [ @b, [[ENTRY:%.*]] ], [ @a, [[L1_LOOPEXIT:%.*]] ] ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond.loopexit: -; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: store i8 poison, i8* null ; CHECK-NEXT: br label [[FOR_COND]] ; CHECK: for.cond: ; CHECK-NEXT: br i1 undef, label [[FOR_END14:%.*]], label [[FOR_COND1_PREHEADER:%.*]] diff --git a/llvm/test/Transforms/NewGVN/pr32845.ll b/llvm/test/Transforms/NewGVN/pr32845.ll --- a/llvm/test/Transforms/NewGVN/pr32845.ll +++ b/llvm/test/Transforms/NewGVN/pr32845.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: [[F_0:%.*]] = phi i32* [ @b, [[ENTRY:%.*]] ], [ @a, [[L1_LOOPEXIT:%.*]] ] ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond.loopexit: -; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: store i8 poison, i8* null ; CHECK-NEXT: br label [[FOR_COND]] ; CHECK: for.cond: ; CHECK-NEXT: br i1 undef, label [[FOR_END14:%.*]], label [[FOR_COND1_PREHEADER:%.*]] diff --git a/llvm/test/Transforms/NewGVN/pr33014.ll b/llvm/test/Transforms/NewGVN/pr33014.ll --- a/llvm/test/Transforms/NewGVN/pr33014.ll +++ b/llvm/test/Transforms/NewGVN/pr33014.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: %dipsy = load i64, i64* @c ; CHECK-NEXT: br label %ph ; CHECK: back: ; preds = %l2 -; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: store i8 poison, i8* null ; CHECK-NEXT: br label %ph ; CHECK: end: ; preds = %l2 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/NewGVN/pr33187.ll b/llvm/test/Transforms/NewGVN/pr33187.ll --- a/llvm/test/Transforms/NewGVN/pr33187.ll +++ b/llvm/test/Transforms/NewGVN/pr33187.ll @@ -30,10 +30,10 @@ ; CHECK: while.body12: ; CHECK-NEXT: br i1 undef, label [[IF_END18]], label [[L]] ; CHECK: L.loopexit: -; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: store i8 poison, i8* null ; CHECK-NEXT: br label [[L]] ; CHECK: L: -; CHECK-NEXT: [[H_125]] = phi i32 [ [[H_127]], [[WHILE_BODY12]] ], [ undef, [[L_LOOPEXIT]] ] +; CHECK-NEXT: [[H_125]] = phi i32 [ [[H_127]], 
[[WHILE_BODY12]] ], [ poison, [[L_LOOPEXIT]] ] ; CHECK-NEXT: br i1 undef, label [[WHILE_COND10]], label [[IF_END18]] ; CHECK: if.end18: ; CHECK-NEXT: [[H_126]] = phi i32 [ [[H_125]], [[L]] ], [ [[H_127]], [[WHILE_BODY12]] ] diff --git a/llvm/test/Transforms/NewGVN/pr33204.ll b/llvm/test/Transforms/NewGVN/pr33204.ll --- a/llvm/test/Transforms/NewGVN/pr33204.ll +++ b/llvm/test/Transforms/NewGVN/pr33204.ll @@ -30,7 +30,7 @@ ; CHECK: bb8: ; CHECK-NEXT: br i1 false, label [[BB9:%.*]], label [[BB3:%.*]] ; CHECK: bb9: -; CHECK-NEXT: store i8 undef, i8* null, align 1 +; CHECK-NEXT: store i8 poison, i8* null, align 1 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb10: ; CHECK-NEXT: store i32 0, i32* @global, align 4, !h !0 diff --git a/llvm/test/Transforms/NewGVN/pr33461.ll b/llvm/test/Transforms/NewGVN/pr33461.ll --- a/llvm/test/Transforms/NewGVN/pr33461.ll +++ b/llvm/test/Transforms/NewGVN/pr33461.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[FOR_COND1:%.*]], label [[FOR_INC:%.*]] ; CHECK: for.cond1: -; CHECK-NEXT: [[PHIOFOPS:%.*]] = phi i16 [ undef, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[PHIOFOPS:%.*]] = phi i16 [ poison, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC]] ] ; CHECK-NEXT: store i16 [[PHIOFOPS]], i16* @b, align 2 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: diff --git a/llvm/test/Transforms/NewGVN/pr33720.ll b/llvm/test/Transforms/NewGVN/pr33720.ll --- a/llvm/test/Transforms/NewGVN/pr33720.ll +++ b/llvm/test/Transforms/NewGVN/pr33720.ll @@ -31,10 +31,10 @@ ; CHECK-NEXT: store i64 7, i64* [[J_3:%.*]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.cond16: -; CHECK-NEXT: [[J_0:%.*]] = phi i64* [ @f, [[ENTRY:%.*]] ], [ undef, [[FOR_COND20:%.*]] ], [ @e, [[FOR_COND16]] ] +; CHECK-NEXT: [[J_0:%.*]] = phi i64* [ @f, [[ENTRY:%.*]] ], [ poison, [[FOR_COND20:%.*]] ], [ @e, [[FOR_COND16]] ] ; CHECK-NEXT: br i1 undef, label [[FOR_COND20]], label [[FOR_COND16]] ; CHECK: for.cond20: -; CHECK-NEXT: [[J_2:%.*]] = phi i64* [ [[J_0]], [[FOR_COND16]] ], [ undef, [[IF_END24]] ] +; CHECK-NEXT: [[J_2:%.*]] = phi i64* [ [[J_0]], [[FOR_COND16]] ], [ poison, [[IF_END24]] ] ; CHECK-NEXT: br i1 true, label [[IF_END24]], label [[FOR_COND16]] ; CHECK: if.end24: ; CHECK-NEXT: [[J_3]] = phi i64* [ [[J_2]], [[FOR_COND20]] ], [ undef, [[ENTRY]] ] diff --git a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll --- a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll +++ b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll @@ -7,7 +7,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label %body, label %end ; CHECK: body: -; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: store i8 poison, i8* null ; CHECK-NEXT: br label %end ; CHECK: end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll --- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -1001,9 +1001,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* 
[[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1027,9 +1024,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1159,9 +1153,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1287,9 +1278,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1313,9 +1301,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1417,9 +1402,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1443,9 +1425,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1547,9 +1526,6 @@ ; AMDGPU-NEXT: 
[[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1573,9 +1549,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1771,9 +1744,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1797,9 +1767,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1845,9 +1812,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -2012,9 +1976,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2038,9 +1999,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: 
[[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2169,9 +2127,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2296,9 +2251,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2322,9 +2274,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2425,9 +2374,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2451,9 +2397,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2554,9 +2497,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], 
align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2580,9 +2520,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2776,9 +2713,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2802,9 +2736,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2850,9 +2781,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2977,9 +2905,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3003,9 +2928,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; 
AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3088,9 +3010,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3173,9 +3092,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3199,9 +3115,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3262,9 +3175,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3288,9 +3198,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; 
AMDGPU-DISABLED-NEXT: ret void @@ -3351,9 +3258,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3377,9 +3281,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3507,9 +3408,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3533,9 +3431,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3581,9 +3476,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -3708,9 +3600,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; 
NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -3734,9 +3623,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -3819,9 +3705,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -3904,9 +3787,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -3930,9 +3810,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -3993,9 +3870,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call 
void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -4019,9 +3893,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -4082,9 +3953,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -4108,9 +3976,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -4238,9 +4103,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -4264,9 +4126,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -4312,9 +4171,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca 
i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/deduplication.ll b/llvm/test/Transforms/OpenMP/deduplication.ll --- a/llvm/test/Transforms/OpenMP/deduplication.ll +++ b/llvm/test/Transforms/OpenMP/deduplication.ll @@ -19,7 +19,6 @@ ; CHECK-DAG: @.str0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 ; CHECK-DAG: @.str1 = private unnamed_addr constant [23 x i8] c";file001;loc0001;0;0;;\00", align 1 ; CHECK-DAG: @.str2 = private unnamed_addr constant [23 x i8] c";file002;loc0002;0;0;;\00", align 1 -; CHECK-DAG: @3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8 ; UTC_ARGS: --enable diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll --- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll +++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll @@ -13,7 +13,7 @@ ; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 ; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ;. define weak void @kernel0() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel0 diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll --- a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll +++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll @@ -4694,10 +4694,10 @@ ; CHECK1-NEXT: entry: ; CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -4719,8 +4719,8 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined.(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: call void @.omp_outlined..1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) ; CHECK1-NEXT: br label [[ENTRY_SPLIT:%.*]] ; CHECK1: entry.split: @@ -4753,7 +4753,7 @@ ; CHECK1-SAME: (i32 [[A:%.*]]) local_unnamed_addr { ; CHECK1-NEXT: entry: ; CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @[[GLOB1]]) +; CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @[[GLOB1:[0-9]+]]) ; CHECK1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 ; CHECK1-NEXT: call void @__kmpc_push_proc_bind(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @[[GLOB1]], i32 [[TMP0]], i32 noundef 3) ; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @[[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) @@ -4837,10 +4837,10 @@ ; CHECK1-NEXT: entry: ; CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -4864,15 +4864,15 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..8(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 ; CHECK1-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK1: omp_region.end: -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK1-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK1: omp.par.merged.split.split: ; CHECK1-NEXT: call void @.omp_outlined..9(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) @@ -4893,7 +4893,7 @@ ; CHECK1: omp.par.merged.split: ; CHECK1-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK1: omp_region.body.split: -; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: br label [[OMP_REGION_END]] ; CHECK1: omp.par.outlined.exit.exitStub: ; CHECK1-NEXT: ret void @@ -4921,11 +4921,11 @@ ; CHECK1-NEXT: [[F_RELOADED:%.*]] = alloca float, align 4 ; CHECK1-NEXT: [[F_ADDR:%.*]] = alloca float, align 4 ; CHECK1-NEXT: store float [[F]], float* [[F_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: store float [[F]], float* [[F_RELOADED]], align 4 ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*, float*, float*)* @merge_seq_float..omp_par to void (i32*, i32*, ...)*), float* [[F_RELOADED]], float* [[F_ADDR]], float* [[P]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*, float*, float*)* @merge_seq_float..omp_par to void (i32*, i32*, ...)*), float* [[F_RELOADED]], float* [[F_ADDR]], float* [[P]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -4948,15 +4948,15 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..10(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK1: omp_region.end: -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK1-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK1: omp.par.merged.split.split: ; CHECK1-NEXT: call void @.omp_outlined..11(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]]) @@ -4976,7 +4976,7 @@ ; CHECK1: omp.par.merged.split: ; CHECK1-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK1: omp_region.body.split: -; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: br label [[OMP_REGION_END]] ; CHECK1: omp.par.outlined.exit.exitStub: ; CHECK1-NEXT: ret void @@ -5006,10 +5006,10 @@ ; CHECK1-NEXT: [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC:%.*]] = alloca i64, align 8 ; CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: 
[[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64*)* @merge_seq_firstprivate..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64*)* @merge_seq_firstprivate..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5033,15 +5033,15 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..12(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 ; CHECK1-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK1: omp_region.end: -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK1-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK1: omp.par.merged.split.split: ; CHECK1-NEXT: [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_LOAD:%.*]] = load i64, i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]], align 8 @@ -5065,7 +5065,7 @@ ; CHECK1: omp.par.merged.split: ; CHECK1-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK1: omp_region.body.split: -; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: br label [[OMP_REGION_END]] ; CHECK1: omp.par.outlined.exit.exitStub: ; CHECK1-NEXT: ret void @@ -5092,10 +5092,10 @@ ; CHECK1-NEXT: entry: ; CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 
[[A]], i32* [[A_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq_sink_lt..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq_sink_lt..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5118,15 +5118,15 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..14(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 ; CHECK1-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK1: omp_region.end: -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK1-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK1: omp.par.merged.split.split: ; CHECK1-NEXT: call void @.omp_outlined..15(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) @@ -5150,7 +5150,7 @@ ; CHECK1: omp.par.merged.split: ; CHECK1-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK1: omp_region.body.split: -; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: br label [[OMP_REGION_END]] ; CHECK1: omp.par.outlined.exit.exitStub: ; CHECK1-NEXT: ret void @@ -5179,13 +5179,13 @@ ; CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 
; CHECK1-NEXT: [[B:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: store i32 [[A]], i32* [[A_RELOADED]], align 4 ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: ; CHECK1-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[B]] to i8* ; CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_seq_par_use..omp_par to void (i32*, i32*, ...)*), i32* [[A_RELOADED]], i32* [[A_ADDR]], i32* [[B]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_seq_par_use..omp_par to void (i32*, i32*, ...)*), i32* [[A_RELOADED]], i32* [[A_ADDR]], i32* [[B]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5210,15 +5210,15 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..16(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK1: omp_region.end: -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK1-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK1: omp.par.merged.split.split: ; CHECK1-NEXT: call void @.omp_outlined..17(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) @@ -5239,7 +5239,7 @@ ; CHECK1: omp.par.merged.split: ; CHECK1-NEXT: br label 
[[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK1: omp_region.body.split: -; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: br label [[OMP_REGION_END]] ; CHECK1: omp.par.outlined.exit.exitStub: ; CHECK1-NEXT: ret void @@ -5270,10 +5270,10 @@ ; CHECK1-NEXT: [[CANCEL2_ADDR:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4 ; CHECK1-NEXT: store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5295,8 +5295,8 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..18(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: call void @.omp_outlined..19(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]]) ; CHECK1-NEXT: br label [[ENTRY_SPLIT:%.*]] ; CHECK1: entry.split: @@ -5345,11 +5345,11 @@ ; CHECK1-NEXT: [[CANCEL2_ADDR:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4 ; CHECK1-NEXT: store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: store i32 [[CANCEL1]], i32* [[CANCEL1_RELOADED]], align 4 ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_cancellable_regions_seq..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_RELOADED]], i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_cancellable_regions_seq..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_RELOADED]], i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5372,15 +5372,15 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..20(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK1: omp_region.end: -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK1-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK1: omp.par.merged.split.split: ; CHECK1-NEXT: call void @.omp_outlined..21(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]]) @@ -5401,7 +5401,7 @@ ; CHECK1: omp.par.merged.split: ; CHECK1-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK1: omp_region.body.split: -; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: br label [[OMP_REGION_END]] ; CHECK1: omp.par.outlined.exit.exitStub: ; CHECK1-NEXT: ret void @@ -5440,10 +5440,10 @@ ; CHECK1-NEXT: entry: ; CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: 
[[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_3..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_3..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5465,11 +5465,11 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..22(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: call void @.omp_outlined..23(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK1-NEXT: call void @.omp_outlined..24(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) ; CHECK1-NEXT: br label [[ENTRY_SPLIT:%.*]] ; CHECK1: entry.split: @@ -5514,11 +5514,11 @@ ; CHECK1-NEXT: [[ADD_SEQ_OUTPUT_ALLOC:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: store i32 [[A]], i32* [[A_RELOADED]], align 4 ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*)* @merge_3_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_RELOADED]], i32* [[A_ADDR]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*)* @merge_3_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_RELOADED]], i32* [[A_ADDR]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5543,27 +5543,27 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..25(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK1: omp_region.end: -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK1-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK1: omp.par.merged.split.split: ; CHECK1-NEXT: call void @.omp_outlined..26(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM4]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM4]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) ; CHECK1-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP4]], 0 ; CHECK1-NEXT: br i1 [[TMP5]], label [[OMP_REGION_BODY5:%.*]], label [[OMP_REGION_END4:%.*]] ; CHECK1: omp_region.end4: -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM6:%.*]] = 
call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM6]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM6]]) ; CHECK1-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT_SPLIT:%.*]] ; CHECK1: omp.par.merged.split.split.split.split: ; CHECK1-NEXT: call void @.omp_outlined..27(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) @@ -5584,7 +5584,7 @@ ; CHECK1: omp.par.merged.split.split.split: ; CHECK1-NEXT: br label [[OMP_REGION_BODY5_SPLIT:%.*]] ; CHECK1: omp_region.body5.split: -; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) +; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) ; CHECK1-NEXT: br label [[OMP_REGION_END4]] ; CHECK1: omp_region.body: ; CHECK1-NEXT: br label [[SEQ_PAR_MERGED:%.*]] @@ -5595,7 +5595,7 @@ ; CHECK1: omp.par.merged.split: ; CHECK1-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK1: omp_region.body.split: -; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: br label [[OMP_REGION_END]] ; CHECK1: omp.par.outlined.exit.exitStub: ; CHECK1-NEXT: ret void @@ -5741,10 +5741,10 @@ ; CHECK1-NEXT: entry: ; CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK1-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK1-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK1: omp_parallel: -; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_2_unmergable_1..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK1-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_2_unmergable_1..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK1-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK1: omp.par.outlined.exit: ; CHECK1-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5768,8 +5768,8 @@ ; CHECK1-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK1: omp.par.merged: ; CHECK1-NEXT: call void @.omp_outlined..37(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK1-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK1-NEXT: call void @.omp_outlined..38(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) ; CHECK1-NEXT: br label [[ENTRY_SPLIT:%.*]] ; CHECK1: entry.split: @@ -5811,10 +5811,10 @@ ; CHECK2-NEXT: entry: ; CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5836,8 +5836,8 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined.(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: call void @.omp_outlined..1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) ; CHECK2-NEXT: br label [[ENTRY_SPLIT:%.*]] ; CHECK2: entry.split: @@ -5870,7 +5870,7 @@ ; CHECK2-SAME: (i32 [[A:%.*]]) local_unnamed_addr { ; CHECK2-NEXT: entry: ; CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @[[GLOB1]]) +; CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @[[GLOB1:[0-9]+]]) ; CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 ; CHECK2-NEXT: call void @__kmpc_push_proc_bind(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @[[GLOB1]], i32 [[TMP0]], i32 noundef 3) ; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @[[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) @@ -5954,10 +5954,10 @@ ; CHECK2-NEXT: entry: ; CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -5981,15 +5981,15 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..8(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 ; CHECK2-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK2: omp_region.end: -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split: ; CHECK2-NEXT: call void @.omp_outlined..9(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) @@ -6010,7 +6010,7 @@ ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: -; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.outlined.exit.exitStub: ; CHECK2-NEXT: ret void @@ -6038,11 +6038,11 @@ ; CHECK2-NEXT: [[F_RELOADED:%.*]] = alloca float, align 4 ; CHECK2-NEXT: [[F_ADDR:%.*]] = alloca float, align 4 ; CHECK2-NEXT: store float [[F]], float* [[F_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: store float [[F]], float* [[F_RELOADED]], align 4 ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*, float*, float*)* @merge_seq_float..omp_par to void (i32*, i32*, ...)*), float* [[F_RELOADED]], float* [[F_ADDR]], float* [[P]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*, float*, float*)* @merge_seq_float..omp_par to void (i32*, i32*, ...)*), float* [[F_RELOADED]], float* [[F_ADDR]], float* [[P]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -6065,15 +6065,15 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..10(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK2: omp_region.end: -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split: ; CHECK2-NEXT: call void @.omp_outlined..11(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]]) @@ -6093,7 +6093,7 @@ ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: -; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.outlined.exit.exitStub: ; CHECK2-NEXT: ret void @@ -6123,10 +6123,10 @@ ; CHECK2-NEXT: [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC:%.*]] = alloca i64, align 8 ; CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: 
[[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64*)* @merge_seq_firstprivate..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64*)* @merge_seq_firstprivate..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -6150,15 +6150,15 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..12(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 ; CHECK2-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK2: omp_region.end: -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split: ; CHECK2-NEXT: [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_LOAD:%.*]] = load i64, i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]], align 8 @@ -6182,7 +6182,7 @@ ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: -; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.outlined.exit.exitStub: ; CHECK2-NEXT: ret void @@ -6209,10 +6209,10 @@ ; CHECK2-NEXT: entry: ; CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 
[[A]], i32* [[A_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq_sink_lt..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq_sink_lt..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -6235,15 +6235,15 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..14(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 ; CHECK2-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK2: omp_region.end: -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split: ; CHECK2-NEXT: call void @.omp_outlined..15(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) @@ -6267,7 +6267,7 @@ ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: -; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.outlined.exit.exitStub: ; CHECK2-NEXT: ret void @@ -6296,13 +6296,13 @@ ; CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 
; CHECK2-NEXT: [[B:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: store i32 [[A]], i32* [[A_RELOADED]], align 4 ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: ; CHECK2-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[B]] to i8* ; CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_seq_par_use..omp_par to void (i32*, i32*, ...)*), i32* [[A_RELOADED]], i32* [[A_ADDR]], i32* [[B]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_seq_par_use..omp_par to void (i32*, i32*, ...)*), i32* [[A_RELOADED]], i32* [[A_ADDR]], i32* [[B]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -6327,15 +6327,15 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..16(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK2: omp_region.end: -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split: ; CHECK2-NEXT: call void @.omp_outlined..17(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) @@ -6356,7 +6356,7 @@ ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label 
[[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: -; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.outlined.exit.exitStub: ; CHECK2-NEXT: ret void @@ -6387,10 +6387,10 @@ ; CHECK2-NEXT: [[CANCEL2_ADDR:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4 ; CHECK2-NEXT: store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -6412,8 +6412,8 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..18(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: call void @.omp_outlined..19(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]]) ; CHECK2-NEXT: br label [[ENTRY_SPLIT:%.*]] ; CHECK2: entry.split: @@ -6462,11 +6462,11 @@ ; CHECK2-NEXT: [[CANCEL2_ADDR:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4 ; CHECK2-NEXT: store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: store i32 [[CANCEL1]], i32* [[CANCEL1_RELOADED]], align 4 ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_cancellable_regions_seq..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_RELOADED]], i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_cancellable_regions_seq..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_RELOADED]], i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -6489,15 +6489,15 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..20(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK2: omp_region.end: -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split: ; CHECK2-NEXT: call void @.omp_outlined..21(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]]) @@ -6518,7 +6518,7 @@ ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: -; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.outlined.exit.exitStub: ; CHECK2-NEXT: ret void @@ -6557,10 +6557,10 @@ ; CHECK2-NEXT: entry: ; CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: 
[[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_3..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_3..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -6582,11 +6582,11 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..22(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: call void @.omp_outlined..23(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK2-NEXT: call void @.omp_outlined..24(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) ; CHECK2-NEXT: br label [[ENTRY_SPLIT:%.*]] ; CHECK2: entry.split: @@ -6631,11 +6631,11 @@ ; CHECK2-NEXT: [[ADD_SEQ_OUTPUT_ALLOC:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: store i32 [[A]], i32* [[A_RELOADED]], align 4 ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*)* @merge_3_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_RELOADED]], i32* [[A_ADDR]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*, i32*)* @merge_3_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_RELOADED]], i32* [[A_ADDR]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -6660,27 +6660,27 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..25(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]] ; CHECK2: omp_region.end: -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM1]]) ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split: ; CHECK2-NEXT: call void @.omp_outlined..26(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM4]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM4]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) ; CHECK2-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP4]], 0 ; CHECK2-NEXT: br i1 [[TMP5]], label [[OMP_REGION_BODY5:%.*]], label [[OMP_REGION_END4:%.*]] ; CHECK2: omp_region.end4: -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM6:%.*]] = 
call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM6]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM6]]) ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split.split.split: ; CHECK2-NEXT: call void @.omp_outlined..27(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) @@ -6701,7 +6701,7 @@ ; CHECK2: omp.par.merged.split.split.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY5_SPLIT:%.*]] ; CHECK2: omp_region.body5.split: -; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) +; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) ; CHECK2-NEXT: br label [[OMP_REGION_END4]] ; CHECK2: omp_region.body: ; CHECK2-NEXT: br label [[SEQ_PAR_MERGED:%.*]] @@ -6712,7 +6712,7 @@ ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: -; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: call void @__kmpc_end_master(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.outlined.exit.exitStub: ; CHECK2-NEXT: ret void @@ -6858,10 +6858,10 @@ ; CHECK2-NEXT: entry: ; CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) ; CHECK2-NEXT: br label [[OMP_PARALLEL:%.*]] ; CHECK2: omp_parallel: -; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_2_unmergable_1..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) +; CHECK2-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @[[GLOB2]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_2_unmergable_1..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]]) ; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] ; CHECK2: omp.par.outlined.exit: ; CHECK2-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] @@ -6885,8 +6885,8 @@ ; CHECK2-NEXT: br label [[OMP_PAR_MERGED:%.*]] ; CHECK2: omp.par.merged: ; CHECK2-NEXT: call void @.omp_outlined..37(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) -; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +; CHECK2-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: call void @.omp_outlined..38(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]]) ; CHECK2-NEXT: br label [[ENTRY_SPLIT:%.*]] ; CHECK2: entry.split: diff --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll --- a/llvm/test/Transforms/OpenMP/remove_globalization.ll +++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll @@ -32,7 +32,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i8 1, i1 false, i1 true) ; CHECK-NEXT: call void @foo() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: call void @bar() #[[ATTR4]] -; CHECK-NEXT: call void @unknown_no_openmp() +; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR3:[0-9]+]] ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i8 1, i1 true) ; CHECK-NEXT: ret void ; @@ -41,7 +41,7 @@ ; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i8 1, i1 false, i1 true) ; CHECK-DISABLED-NEXT: call void @foo() #[[ATTR4:[0-9]+]] ; CHECK-DISABLED-NEXT: call void @bar() #[[ATTR4]] -; CHECK-DISABLED-NEXT: call void @unknown_no_openmp() +; CHECK-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR3:[0-9]+]] ; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i8 1, i1 true) ; CHECK-DISABLED-NEXT: ret void ; @@ -183,14 +183,14 @@ ; CHECK: attributes #[[ATTR0]] = { nosync nounwind } ; CHECK: attributes #[[ATTR1]] = { nofree nosync nounwind readnone willreturn } ; CHECK: attributes #[[ATTR2]] = { nofree nosync nounwind willreturn writeonly } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } +; CHECK: attributes #[[ATTR3]] = { "llvm.assume"="omp_no_openmp" } ; CHECK: attributes #[[ATTR4]] = { nounwind } ; CHECK: attributes #[[ATTR5]] = { nosync nounwind writeonly } ;. ; CHECK-DISABLED: attributes #[[ATTR0]] = { nosync nounwind } ; CHECK-DISABLED: attributes #[[ATTR1]] = { nofree nosync nounwind readnone willreturn } ; CHECK-DISABLED: attributes #[[ATTR2]] = { nofree nosync nounwind willreturn writeonly } -; CHECK-DISABLED: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } +; CHECK-DISABLED: attributes #[[ATTR3]] = { "llvm.assume"="omp_no_openmp" } ; CHECK-DISABLED: attributes #[[ATTR4]] = { nounwind } ; CHECK-DISABLED: attributes #[[ATTR5]] = { nosync nounwind writeonly } ;. 
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -20,7 +20,7 @@ define dso_local void @foo() { entry: %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) - %x = call i8* @__kmpc_alloc_shared(i64 4) + %x = call align 4 i8* @__kmpc_alloc_shared(i64 4) call void @unknown_no_openmp() %x_on_stack = bitcast i8* %x to i32* %0 = bitcast i32* %x_on_stack to i8* @@ -36,7 +36,7 @@ %cmp = icmp eq i32 %c, -1 br i1 %cmp, label %master1, label %exit master1: - %x = call i8* @__kmpc_alloc_shared(i64 16), !dbg !11 + %x = call align 4 i8* @__kmpc_alloc_shared(i64 16), !dbg !11 %x_on_stack = bitcast i8* %x to [4 x i32]* %a0 = bitcast [4 x i32]* %x_on_stack to i8* call void @use(i8* %a0) @@ -47,7 +47,7 @@ %b0 = icmp eq i32 %c, -1 br i1 %b0, label %master2, label %exit master2: - %y = call i8* @__kmpc_alloc_shared(i64 4), !dbg !12 + %y = call align 4 i8* @__kmpc_alloc_shared(i64 4), !dbg !12 %y_on_stack = bitcast i8* %y to [4 x i32]* %b1 = bitcast [4 x i32]* %y_on_stack to i8* call void @use(i8* %b1) @@ -64,7 +64,7 @@ %c0 = icmp eq i32 %c, -1 br i1 %c0, label %master3, label %exit master3: - %z = call i8* @__kmpc_alloc_shared(i64 24), !dbg !12 + %z = call align 4 i8* @__kmpc_alloc_shared(i64 24), !dbg !12 %z_on_stack = bitcast i8* %z to [6 x i32]* %c1 = bitcast [6 x i32]* %z_on_stack to i8* call void @use(i8* %c1) @@ -119,15 +119,15 @@ ; CHECK: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global i8* ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [113 x i8] c" ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [16 x i8] undef, align 32 -; CHECK: @[[Y:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [16 x i8] undef, align 4 +; CHECK: @[[Y:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 ;. 
; CHECK-LABEL: define {{[^@]+}}@foo() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; CHECK-NEXT: [[X:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) #[[ATTR5:[0-9]+]] -; CHECK-NEXT: call void @unknown_no_openmp() -; CHECK-NEXT: call void @use.internalized(i8* nofree writeonly [[X]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[X:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 4) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: call void @use.internalized(i8* nofree writeonly align 4 [[X]]) #[[ATTR6:[0-9]+]] ; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[X]], i64 4) #[[ATTR5]] ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; CHECK-NEXT: ret void @@ -135,17 +135,17 @@ ; ; CHECK-LABEL: define {{[^@]+}}@bar() { ; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; CHECK-NEXT: call void @unknown_no_openmp() +; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR4]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[C]], -1 ; CHECK-NEXT: br i1 [[CMP]], label [[MASTER1:%.*]], label [[EXIT:%.*]] ; CHECK: master1: -; CHECK-NEXT: call void @use.internalized(i8* nofree addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*)) #[[ATTR6]] +; CHECK-NEXT: call void @use.internalized(i8* nofree align 4 addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*)) #[[ATTR6]] ; CHECK-NEXT: br label [[NEXT:%.*]] ; CHECK: next: -; CHECK-NEXT: call void @unknown_no_openmp() +; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR4]] ; CHECK-NEXT: br label [[MASTER2:%.*]] ; CHECK: master2: -; CHECK-NEXT: call void @use.internalized(i8* nofree addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @y, i32 0, i32 0) to i8*)) #[[ATTR6]] +; CHECK-NEXT: call void @use.internalized(i8* nofree align 4 addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @y, i32 0, i32 0) to i8*)) #[[ATTR6]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) @@ -154,12 +154,12 @@ ; ; CHECK-LABEL: define {{[^@]+}}@baz_spmd() { ; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 true, i1 true) -; CHECK-NEXT: call void @unknown_no_openmp() +; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR4]] ; CHECK-NEXT: [[C0:%.*]] = icmp eq i32 [[C]], -1 ; CHECK-NEXT: br i1 [[C0]], label [[MASTER3:%.*]], label [[EXIT:%.*]] ; CHECK: master3: -; CHECK-NEXT: [[Z:%.*]] = call i8* @__kmpc_alloc_shared(i64 24) #[[ATTR5]], !dbg [[DBG9:![0-9]+]] -; CHECK-NEXT: call void @use.internalized(i8* nofree [[Z]]) #[[ATTR6]] +; CHECK-NEXT: [[Z:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 24) #[[ATTR5]], !dbg [[DBG9:![0-9]+]] +; CHECK-NEXT: call void @use.internalized(i8* nofree align 4 [[Z]]) #[[ATTR6]] ; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[Z]], i64 24) #[[ATTR5]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -169,7 +169,7 @@ ; ; CHECK: Function Attrs: nofree nosync nounwind willreturn writeonly ; CHECK-LABEL: define {{[^@]+}}@use.internalized -; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: (i8* nofree writeonly align 4 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: store 
i8* [[X]], i8** @S, align 8 ; CHECK-NEXT: ret void @@ -186,7 +186,7 @@ ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nosync nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind readnone speculatable } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nofree nosync nounwind readnone speculatable willreturn } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } +; CHECK: attributes #[[ATTR4]] = { "llvm.assume"="omp_no_openmp" } ; CHECK: attributes #[[ATTR5]] = { nounwind } ; CHECK: attributes #[[ATTR6]] = { nounwind writeonly } ;. diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -113,8 +113,9 @@ ; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; AMDGPU: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata" -; AMDGPU: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 -; AMDGPU: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; AMDGPU: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; AMDGPU: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 +; AMDGPU: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 ; AMDGPU: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ;. 
; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" @@ -126,8 +127,9 @@ ; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; NVPTX: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata" -; NVPTX: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 -; NVPTX: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; NVPTX: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; NVPTX: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 +; NVPTX: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 ; NVPTX: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ;. ; AMDGPU-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" @@ -139,8 +141,8 @@ ; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; AMDGPU-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata" -; AMDGPU-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 -; AMDGPU-DISABLED: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; AMDGPU-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 +; AMDGPU-DISABLED: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef @@ -156,8 +158,8 @@ ; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; NVPTX-DISABLED: 
@[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata" -; NVPTX-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 -; NVPTX-DISABLED: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; NVPTX-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 +; NVPTX-DISABLED: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 ; NVPTX-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; NVPTX-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; NVPTX-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef @@ -248,7 +250,6 @@ ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) @@ -302,7 +303,6 @@ ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) @@ -465,8 +465,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -477,8 +475,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: store 
i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -489,8 +485,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -501,8 +495,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -603,7 +595,6 @@ ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) @@ -657,7 +648,6 @@ ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) @@ -688,7 +678,7 @@ ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 ; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* ; AMDGPU-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] ; AMDGPU-NEXT: br label [[FOR_COND:%.*]] @@ -710,7 +700,7 @@ ; NVPTX-SAME: (i32* noalias 
nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; NVPTX-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; NVPTX-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 ; NVPTX-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* ; NVPTX-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] ; NVPTX-NEXT: br label [[FOR_COND:%.*]] @@ -732,7 +722,7 @@ ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 ; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* ; AMDGPU-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] @@ -754,7 +744,7 @@ ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 ; NVPTX-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* ; NVPTX-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] ; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] @@ -774,7 +764,7 @@ ; entry: %captured_vars_addrs = alloca [0 x i8*], align 8 - %x = call i8* @__kmpc_alloc_shared(i64 4) + %x = call align 4 i8* @__kmpc_alloc_shared(i64 4) %x_on_stack = bitcast i8* %x to i32* call void @use(i32* nocapture %x_on_stack) #10 br label %for.cond @@ -835,8 +825,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -847,8 +835,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -859,8 +845,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; 
AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -871,8 +855,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void @@ -974,7 +956,6 @@ ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) @@ -1028,7 +1009,6 @@ ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) @@ -1141,7 +1121,7 @@ ; entry: %captured_vars_addrs = alloca [1 x i8*], align 8 - %x = call i8* @__kmpc_alloc_shared(i64 4) + %x = call align 4 i8* @__kmpc_alloc_shared(i64 4) %x_on_stack = bitcast i8* %x to i32* br label %for.cond @@ -1219,8 +1199,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 ; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** @@ -1234,8 +1212,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-NEXT: call void 
@__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 ; NVPTX-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** @@ -1249,8 +1225,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 ; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** @@ -1264,8 +1238,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 ; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** @@ -1372,7 +1344,6 @@ ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) @@ -1426,7 +1397,6 @@ ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) @@ -1469,7 +1439,7 @@ ; AMDGPU: region.guarded.end: ; AMDGPU-NEXT: br label [[REGION_BARRIER]] ; AMDGPU: region.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) ; AMDGPU-NEXT: br label [[REGION_EXIT:%.*]] ; AMDGPU: region.exit: ; AMDGPU-NEXT: br label [[FOR_COND:%.*]] @@ -1505,7 +1475,7 @@ ; NVPTX: region.guarded.end: ; NVPTX-NEXT: br label [[REGION_BARRIER]] ; NVPTX: region.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) ; NVPTX-NEXT: br label [[REGION_EXIT:%.*]] ; NVPTX: region.exit: ; NVPTX-NEXT: br label [[FOR_COND:%.*]] @@ -1573,7 +1543,7 @@ ; entry: %captured_vars_addrs = alloca [1 x i8*], align 8 - %x = call i8* @__kmpc_alloc_shared(i64 4) + %x = call align 4 i8* @__kmpc_alloc_shared(i64 4) %x_on_stack = bitcast i8* %x to i32* store i32 42, i32* %x_on_stack, align 4, !tbaa !18 br label %for.cond @@ -1652,8 +1622,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 ; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** @@ -1667,8 +1635,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 ; NVPTX-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** @@ -1682,8 +1648,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 ; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** @@ -1697,8 +1661,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 ; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** @@ -1858,7 +1820,6 @@ ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] @@ -1905,7 +1866,6 @@ ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: 
[[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] @@ -2323,8 +2283,6 @@ ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -2335,8 +2293,6 @@ ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2347,8 +2303,6 @@ ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -2359,8 +2313,6 @@ ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll --- a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll @@ -25,6 +25,7 @@ ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_fd02_404433c2_main_l5_exec_mode], section "llvm.metadata" +; CHECK: @[[GLOB2:[0-9]+]] = 
private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ;. define weak void @__omp_offloading_fd02_404433c2_main_l5(double* nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_404433c2_main_l5 @@ -50,7 +51,7 @@ ; CHECK: region.guarded.end: ; CHECK-NEXT: br label [[REGION_BARRIER]] ; CHECK: region.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) ; CHECK-NEXT: br label [[REGION_EXIT:%.*]] ; CHECK: region.exit: ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll --- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll @@ -47,6 +47,7 @@ ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata" +; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ;. 
; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; CHECK-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 @@ -81,7 +82,7 @@ ; CHECK: region.guarded.end: ; CHECK-NEXT: br label [[REGION_BARRIER]] ; CHECK: region.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) ; CHECK-NEXT: br label [[REGION_EXIT:%.*]] ; CHECK: region.exit: ; CHECK-NEXT: br label [[FOR_COND_I:%.*]] @@ -105,7 +106,7 @@ ; CHECK: region.guarded.end1: ; CHECK-NEXT: br label [[REGION_BARRIER2]] ; CHECK: region.barrier2: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]]) ; CHECK-NEXT: br label [[REGION_EXIT3]] ; CHECK: region.exit3: ; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 @@ -126,7 +127,7 @@ ; CHECK: region.guarded.end6: ; CHECK-NEXT: br label [[REGION_BARRIER7]] ; CHECK: region.barrier7: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]]) ; CHECK-NEXT: br label [[REGION_EXIT8:%.*]] ; CHECK: region.exit8: ; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 @@ -143,7 +144,7 @@ ; CHECK: region.guarded.end11: ; CHECK-NEXT: br label [[REGION_BARRIER12]] ; CHECK: region.barrier12: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]]) ; CHECK-NEXT: br label [[REGION_EXIT13:%.*]] ; CHECK: region.exit13: ; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 @@ -160,7 +161,7 @@ ; CHECK: region.guarded.end16: ; CHECK-NEXT: br label [[REGION_BARRIER17]] ; CHECK: region.barrier17: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP10]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP10]]) ; CHECK-NEXT: br label [[REGION_EXIT18:%.*]] ; CHECK: region.exit18: ; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll --- a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll @@ -193,9 +193,6 @@ ; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 ; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-2 | FileCheck %s + +define void @foo(i32* noalias nocapture writeonly %B, i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %C, i32 %n, i32 %m) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[N:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 +; CHECK-NEXT: [[MUL2:%.*]] = mul nsw i32 [[TMP1]], [[M:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL2]], [[MUL]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[MUL4:%.*]] = mul nsw i32 [[ADD]], [[TMP2]] +; CHECK-NEXT: store i32 [[MUL4]], i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], [[M]] +; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[MUL9]], [[MUL]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[MUL12:%.*]] = mul nsw i32 [[ADD10]], [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 +; CHECK-NEXT: store i32 [[MUL12]], i32* [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[TMP2]], [[N]] +; CHECK-NEXT: [[MUL17:%.*]] = mul nsw i32 [[TMP4]], [[M]] +; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL15]] +; CHECK-NEXT: [[MUL20:%.*]] = mul nsw i32 [[ADD18]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[MUL25:%.*]] = mul nsw i32 [[TMP5]], [[M]] +; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL25]], [[MUL15]] +; CHECK-NEXT: [[MUL28:%.*]] = mul nsw i32 [[ADD26]], [[TMP1]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: store i32 [[MUL28]], i32* [[ARRAYIDX29]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = load i32, i32* %A, align 4 + %mul = mul nsw i32 %0, %n + %1 = load i32, i32* %C, align 4 + %mul2 = mul nsw i32 %1, %m + %add = add nsw i32 %mul2, %mul + %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1 + %2 = load i32, i32* %arrayidx3, align 4 + %mul4 = mul nsw i32 %add, %2 + store i32 %mul4, i32* %B, align 4 + %arrayidx8 = getelementptr inbounds i32, i32* %C, i64 1 + %3 = load i32, i32* %arrayidx8, align 4 + %mul9 = mul nsw i32 %3, %m + %add10 = add nsw i32 %mul9, %mul + %arrayidx11 = getelementptr inbounds i32, i32* %C, i64 2 + %4 = load i32, i32* %arrayidx11, align 4 + %mul12 = mul nsw i32 %add10, %4 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 1 + store i32 %mul12, i32* %arrayidx13, align 4 + %mul15 = mul nsw i32 %2, %n + %mul17 = mul nsw i32 %4, %m + %add18 = add nsw i32 %mul17, %mul15 + %mul20 = mul nsw i32 %add18, %0 + 
%arrayidx21 = getelementptr inbounds i32, i32* %B, i64 2 + store i32 %mul20, i32* %arrayidx21, align 4 + %arrayidx24 = getelementptr inbounds i32, i32* %C, i64 3 + %5 = load i32, i32* %arrayidx24, align 4 + %mul25 = mul nsw i32 %5, %m + %add26 = add nsw i32 %mul25, %mul15 + %mul28 = mul nsw i32 %add26, %1 + %arrayidx29 = getelementptr inbounds i32, i32* %B, i64 3 + store i32 %mul28, i32* %arrayidx29, align 4 + ret void +} diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_curanges_incomplete.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_curanges_incomplete.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_curanges_incomplete.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_curanges_incomplete.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: DIE address ranges are not contained in its parent's ranges: --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_debug_info.s b/llvm/test/tools/llvm-dwarfdump/X86/verify_debug_info.s --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_debug_info.s +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_debug_info.s @@ -46,6 +46,7 @@ # CHECK-NEXT: DW_AT_decl_file [DW_FORM_data1] (0x01) # CHECK-NEXT: DW_AT_decl_line [DW_FORM_data1] (2) # CHECK-NEXT: DW_AT_use_location [DW_FORM_ref4] (cu + 0x0053 => {0x00000053}){{[[:space:]]}} +# CHECK-NEXT: Verifying unit: 2 / 2 # CHECK-NEXT: error: Compilation unit root DIE is not a unit DIE: DW_TAG_null. # CHECK-NEXT: error: Compilation unit type (DW_UT_compile) and root DIE (DW_TAG_null) do not match. diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_die_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_die_ranges.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_die_ranges.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_die_ranges.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1 # CHECK-NEXT: error: Invalid address range [0x0000000000000007, 0x0000000000000006) --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_cu_ref.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_cu_ref.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_cu_ref.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_cu_ref.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: DW_FORM_ref4 CU offset 0x00001234 is invalid (must be less than CU size of 0x0000001a): --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_die_range.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_die_range.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_die_range.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_die_range.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... 
+# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: Invalid address range --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ranges.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ranges.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ranges.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: DW_AT_ranges offset is beyond .debug_ranges bounds: 0x00001000 --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: DW_FORM_ref_addr offset beyond .debug_info bounds: --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: invalid DIE reference 0x00000011. Offset is in between DIEs: --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_rnglists.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_rnglists.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_rnglists.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_rnglists.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: DW_AT_ranges offset is beyond .debug_rnglists bounds: 0x00001000 --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_stmt_list.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_stmt_list.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_stmt_list.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_stmt_list.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: DW_AT_stmt_list offset is beyond .debug_line bounds: 0x00001000 --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_strp.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_strp.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_strp.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_strp.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... 
+# CHECK-NEXT: Verifying unit: 1 / 1 # CHECK-NEXT: error: DW_FORM_strp offset 4660 is beyond .debug_str bounds: --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_lexical_block_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_lexical_block_ranges.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_lexical_block_ranges.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_lexical_block_ranges.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: DIE address ranges are not contained in its parent's ranges: --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_function_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_function_ranges.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_function_ranges.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_function_ranges.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: DIEs have overlapping address ranges --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_lexical_block_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_lexical_block_ranges.yaml --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_lexical_block_ranges.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_lexical_block_ranges.yaml @@ -2,6 +2,7 @@ # RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s # CHECK: Verifying non-dwo Units... +# CHECK-NEXT: Verifying unit: 1 / 1, "/tmp/main.c" # CHECK-NEXT: error: DIEs have overlapping address ranges --- !ELF diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s --- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s @@ -212,6 +212,24 @@ vminps (%rax), %zmm17, %zmm19 {z}{k1} vminps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} +vmovapd %zmm16, %zmm19 +vmovapd (%rax), %zmm19 +vmovapd %zmm16, (%rax) +vmovapd %zmm16, %zmm19 {k1} +vmovapd (%rax), %zmm19 {k1} +vmovapd %zmm16, (%rax) {k1} +vmovapd %zmm16, %zmm19 {z}{k1} +vmovapd (%rax), %zmm19 {z}{k1} + +vmovaps %zmm16, %zmm19 +vmovaps (%rax), %zmm19 +vmovaps %zmm16, (%rax) +vmovaps %zmm16, %zmm19 {k1} +vmovaps (%rax), %zmm19 {k1} +vmovaps %zmm16, (%rax) {k1} +vmovaps %zmm16, %zmm19 {z}{k1} +vmovaps (%rax), %zmm19 {z}{k1} + vmovddup %zmm16, %zmm19 vmovddup (%rax), %zmm19 vmovddup %zmm16, %zmm19 {k1} @@ -219,6 +237,42 @@ vmovddup %zmm16, %zmm19 {z}{k1} vmovddup (%rax), %zmm19 {z}{k1} +vmovdqa32 %zmm16, %zmm19 +vmovdqa32 (%rax), %zmm19 +vmovdqa32 %zmm16, (%rax) +vmovdqa32 %zmm16, %zmm19 {k1} +vmovdqa32 (%rax), %zmm19 {k1} +vmovdqa32 %zmm16, (%rax) {k1} +vmovdqa32 %zmm16, %zmm19 {z}{k1} +vmovdqa32 (%rax), %zmm19 {z}{k1} + +vmovdqa64 %zmm16, %zmm19 +vmovdqa64 (%rax), %zmm19 +vmovdqa64 %zmm16, (%rax) +vmovdqa64 %zmm16, %zmm19 {k1} +vmovdqa64 (%rax), %zmm19 {k1} +vmovdqa64 %zmm16, (%rax) {k1} +vmovdqa64 %zmm16, %zmm19 {z}{k1} +vmovdqa64 (%rax), %zmm19 {z}{k1} + +vmovdqu32 %zmm16, %zmm19 +vmovdqu32 (%rax), %zmm19 +vmovdqu32 %zmm16, (%rax) +vmovdqu32 %zmm16, %zmm19 {k1} +vmovdqu32 (%rax), %zmm19 {k1} +vmovdqu32 %zmm16, (%rax) {k1} +vmovdqu32 %zmm16, %zmm19 {z}{k1} +vmovdqu32 (%rax), %zmm19 {z}{k1} + +vmovdqu64 %zmm16, %zmm19 +vmovdqu64 (%rax), %zmm19 +vmovdqu64 %zmm16, 
(%rax) +vmovdqu64 %zmm16, %zmm19 {k1} +vmovdqu64 (%rax), %zmm19 {k1} +vmovdqu64 %zmm16, (%rax) {k1} +vmovdqu64 %zmm16, %zmm19 {z}{k1} +vmovdqu64 (%rax), %zmm19 {z}{k1} + vmovshdup %zmm16, %zmm19 vmovshdup (%rax), %zmm19 vmovshdup %zmm16, %zmm19 {k1} @@ -233,6 +287,24 @@ vmovsldup %zmm16, %zmm19 {z}{k1} vmovsldup (%rax), %zmm19 {z}{k1} +vmovupd %zmm16, %zmm19 +vmovupd (%rax), %zmm19 +vmovupd %zmm16, (%rax) +vmovupd %zmm16, %zmm19 {k1} +vmovupd (%rax), %zmm19 {k1} +vmovupd %zmm16, (%rax) {k1} +vmovupd %zmm16, %zmm19 {z}{k1} +vmovupd (%rax), %zmm19 {z}{k1} + +vmovups %zmm16, %zmm19 +vmovups (%rax), %zmm19 +vmovups %zmm16, (%rax) +vmovups %zmm16, %zmm19 {k1} +vmovups (%rax), %zmm19 {k1} +vmovups %zmm16, (%rax) {k1} +vmovups %zmm16, %zmm19 {z}{k1} +vmovups (%rax), %zmm19 {z}{k1} + vmulpd %zmm16, %zmm17, %zmm19 vmulpd (%rax), %zmm17, %zmm19 vmulpd (%rax){1to8}, %zmm17, %zmm19 @@ -996,12 +1068,60 @@ # CHECK-NEXT: 1 3 1.00 vminps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 10 1.00 * vminps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 10 1.00 * vminps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovapd %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovapd (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovapd %zmm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovapd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovapd (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovapd %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovapd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovapd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovaps %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovaps (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovaps %zmm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovaps %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovaps (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovaps %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovaps %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovaps (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovddup %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vmovddup (%rax), %zmm19 # CHECK-NEXT: 1 1 1.00 vmovddup %zmm16, %zmm19 {%k1} # CHECK-NEXT: 2 8 1.00 * vmovddup (%rax), %zmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovddup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vmovddup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqa32 (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqa32 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqa32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqa32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqa32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqa64 (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqa64 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqa64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqa64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqa64 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqu32 (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu32 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqu32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 
0.50 * vmovdqu32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqu64 (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu64 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqu64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqu64 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovshdup %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vmovshdup (%rax), %zmm19 # CHECK-NEXT: 1 1 1.00 vmovshdup %zmm16, %zmm19 {%k1} @@ -1014,6 +1134,22 @@ # CHECK-NEXT: 2 8 1.00 * vmovsldup (%rax), %zmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovsldup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vmovsldup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovupd %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovupd (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovupd %zmm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovupd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovupd (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovupd %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovupd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovupd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovups %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovups (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovups %zmm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovups %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovups (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovups %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovups %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovups (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 5 1.00 vmulpd %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 12 1.00 * vmulpd (%rax), %zmm17, %zmm19 # CHECK-NEXT: 2 12 1.00 * vmulpd (%rax){1to8}, %zmm17, %zmm19 @@ -1531,7 +1667,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - 1506.00 138.67 201.67 - 438.67 225.50 225.50 +# CHECK-NEXT: - 1506.00 144.67 201.67 16.00 456.67 245.50 245.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -1719,12 +1855,60 @@ # CHECK-NEXT: - - - 1.00 - - - - vminps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - 1.00 - - 0.50 0.50 vminps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - 1.00 - - 0.50 0.50 vminps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovapd %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovapd (%rax), %zmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovapd %zmm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovapd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovapd (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovapd %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovapd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovapd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovaps %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovaps (%rax), %zmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovaps %zmm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovaps %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovaps (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovaps %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovaps %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovaps (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 - - vmovddup %zmm16, %zmm19 # 
CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovddup (%rax), %zmm19 # CHECK-NEXT: - - - - - 1.00 - - vmovddup %zmm16, %zmm19 {%k1} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovddup (%rax), %zmm19 {%k1} # CHECK-NEXT: - - - - - 1.00 - - vmovddup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovddup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa32 %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa32 (%rax), %zmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa32 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa64 %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa64 (%rax), %zmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa64 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa64 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu32 %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu32 (%rax), %zmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu32 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu64 %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu64 (%rax), %zmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu64 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu64 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 - - vmovshdup %zmm16, %zmm19 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovshdup (%rax), %zmm19 # CHECK-NEXT: - - - - - 1.00 - - vmovshdup %zmm16, %zmm19 {%k1} @@ -1737,6 +1921,22 @@ # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovsldup (%rax), %zmm19 {%k1} # CHECK-NEXT: - - - - - 1.00 - - vmovsldup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovsldup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovupd %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovupd (%rax), %zmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovupd %zmm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovupd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovupd (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovupd %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovupd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovupd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovups %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovups (%rax), %zmm19 +# 
CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovups %zmm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovups %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovups (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovups %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovups %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovups (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.00 - - - - - vmulpd %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - 1.00 - - - 0.50 0.50 vmulpd (%rax), %zmm17, %zmm19 # CHECK-NEXT: - - 1.00 - - - 0.50 0.50 vmulpd (%rax){1to8}, %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512bw.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512bw.s --- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512bw.s +++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512bw.s @@ -20,6 +20,24 @@ kunpckdq %k0, %k1, %k2 kunpckwd %k0, %k1, %k2 +vmovdqu8 %zmm16, %zmm19 +vmovdqu8 (%rax), %zmm19 +vmovdqu8 %zmm16, (%rax) +vmovdqu8 %zmm16, %zmm19 {k1} +vmovdqu8 (%rax), %zmm19 {k1} +vmovdqu8 %zmm16, (%rax) {k1} +vmovdqu8 %zmm16, %zmm19 {z}{k1} +vmovdqu8 (%rax), %zmm19 {z}{k1} + +vmovdqu16 %zmm16, %zmm19 +vmovdqu16 (%rax), %zmm19 +vmovdqu16 %zmm16, (%rax) +vmovdqu16 %zmm16, %zmm19 {k1} +vmovdqu16 (%rax), %zmm19 {k1} +vmovdqu16 %zmm16, (%rax) {k1} +vmovdqu16 %zmm16, %zmm19 {z}{k1} +vmovdqu16 (%rax), %zmm19 {z}{k1} + vpabsb %zmm16, %zmm19 vpabsb (%rax), %zmm19 vpabsb %zmm16, %zmm19 {k1} @@ -280,6 +298,22 @@ # CHECK-NEXT: 1 1 1.00 kshiftrq $2, %k1, %k2 # CHECK-NEXT: 1 1 1.00 kunpckdq %k0, %k1, %k2 # CHECK-NEXT: 1 1 1.00 kunpckwd %k0, %k1, %k2 +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqu8 (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu8 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqu8 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu8 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqu8 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %zmm16, %zmm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqu16 (%rax), %zmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu16 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqu16 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu16 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqu16 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 1 0.50 vpabsb %zmm16, %zmm19 # CHECK-NEXT: 2 8 0.50 * vpabsb (%rax), %zmm19 # CHECK-NEXT: 1 1 0.50 vpabsb %zmm16, %zmm19 {%k1} @@ -470,8 +504,8 @@ # CHECK-NEXT: 2 8 1.00 * vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vpunpcklwd %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.33 vpmovm2b %k0, %zmm0 -# CHECK-NEXT: 1 1 0.33 vpmovm2w %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2b %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2w %k0, %zmm0 # CHECK-NEXT: 1 1 0.33 vpmovb2m %zmm0, %k0 # CHECK-NEXT: 1 1 0.33 vpmovw2m %zmm0, %k0 @@ -487,7 +521,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - - 10.67 47.67 - 153.67 47.50 47.50 +# CHECK-NEXT: - - 14.00 47.00 4.00 157.00 52.50 52.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -509,6 +543,22 @@ # CHECK-NEXT: - - - - - 1.00 - - kshiftrq $2, %k1, %k2 # CHECK-NEXT: - - - - - 
1.00 - - kunpckdq %k0, %k1, %k2 # CHECK-NEXT: - - - - - 1.00 - - kunpckwd %k0, %k1, %k2 +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu8 %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu8 (%rax), %zmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu8 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu8 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu8 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu8 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu8 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu8 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu16 %zmm16, %zmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu16 (%rax), %zmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu16 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu16 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu16 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu16 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu16 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu16 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - - 0.50 - 0.50 - - vpabsb %zmm16, %zmm19 # CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vpabsb (%rax), %zmm19 # CHECK-NEXT: - - - 0.50 - 0.50 - - vpabsb %zmm16, %zmm19 {%k1} @@ -699,7 +749,7 @@ # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} # CHECK-NEXT: - - - - - 1.00 - - vpunpcklwd %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2b %k0, %zmm0 -# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2w %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - 0.50 - - vpmovm2b %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - 0.50 - - vpmovm2w %k0, %zmm0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovb2m %zmm0, %k0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovw2m %zmm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512bwvl.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512bwvl.s --- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512bwvl.s +++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512bwvl.s @@ -1,6 +1,42 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -instruction-tables < %s | FileCheck %s +vmovdqu8 %xmm16, %xmm19 +vmovdqu8 (%rax), %xmm19 +vmovdqu8 %xmm16, (%rax) +vmovdqu8 %xmm16, %xmm19 {k1} +vmovdqu8 (%rax), %xmm19 {k1} +vmovdqu8 %xmm16, (%rax) {k1} +vmovdqu8 %xmm16, %xmm19 {z}{k1} +vmovdqu8 (%rax), %xmm19 {z}{k1} + +vmovdqu8 %ymm16, %ymm19 +vmovdqu8 (%rax), %ymm19 +vmovdqu8 %ymm16, (%rax) +vmovdqu8 %ymm16, %ymm19 {k1} +vmovdqu8 (%rax), %ymm19 {k1} +vmovdqu8 %ymm16, (%rax) {k1} +vmovdqu8 %ymm16, %ymm19 {z}{k1} +vmovdqu8 (%rax), %ymm19 {z}{k1} + +vmovdqu16 %xmm16, %xmm19 +vmovdqu16 (%rax), %xmm19 +vmovdqu16 %xmm16, (%rax) +vmovdqu16 %xmm16, %xmm19 {k1} +vmovdqu16 (%rax), %xmm19 {k1} +vmovdqu16 %xmm16, (%rax) {k1} +vmovdqu16 %xmm16, %xmm19 {z}{k1} +vmovdqu16 (%rax), %xmm19 {z}{k1} + +vmovdqu16 %ymm16, %ymm19 +vmovdqu16 (%rax), %ymm19 +vmovdqu16 %ymm16, (%rax) +vmovdqu16 %ymm16, %ymm19 {k1} +vmovdqu16 (%rax), %ymm19 {k1} +vmovdqu16 %ymm16, (%rax) {k1} +vmovdqu16 %ymm16, %ymm19 {z}{k1} +vmovdqu16 (%rax), %ymm19 {z}{k1} + vpabsb %xmm16, %xmm19 vpabsb (%rax), %xmm19 vpabsb %xmm16, %xmm19 {k1} @@ -476,6 +512,38 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# 
CHECK-NEXT: 1 1 0.33 vmovdqu8 %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovdqu8 (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu8 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovdqu8 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu8 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovdqu8 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqu8 (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu8 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqu8 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu8 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqu8 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovdqu16 (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu16 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovdqu16 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu16 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovdqu16 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqu16 (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu16 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqu16 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu16 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqu16 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 0.50 vpabsb %xmm16, %xmm19 # CHECK-NEXT: 2 7 0.50 * vpabsb (%rax), %xmm19 # CHECK-NEXT: 1 1 0.50 vpabsb %xmm16, %xmm19 {%k1} @@ -858,8 +926,8 @@ # CHECK-NEXT: 2 8 1.00 * vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 0.33 vpmovm2b %k0, %xmm0 # CHECK-NEXT: 1 1 0.33 vpmovm2w %k0, %xmm0 -# CHECK-NEXT: 1 1 0.33 vpmovm2b %k0, %ymm0 -# CHECK-NEXT: 1 1 0.33 vpmovm2w %k0, %ymm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2b %k0, %ymm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2w %k0, %ymm0 # CHECK-NEXT: 1 1 0.33 vpmovb2m %xmm0, %k0 # CHECK-NEXT: 1 1 0.33 vpmovw2m %xmm0, %k0 # CHECK-NEXT: 1 1 0.33 vpmovb2m %ymm0, %k0 @@ -877,10 +945,42 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - - 13.33 131.33 - 243.33 95.00 95.00 +# CHECK-NEXT: - - 18.67 132.67 8.00 248.67 105.00 105.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu8 %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu8 (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu8 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu8 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu8 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu8 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu8 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu8 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu8 %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu8 (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu8 %ymm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu8 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu8 (%rax), %ymm19 
{%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu8 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu8 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu8 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu16 %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu16 (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu16 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu16 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu16 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu16 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu16 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu16 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu16 %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu16 (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu16 %ymm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu16 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu16 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu16 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu16 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu16 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - - 0.50 - 0.50 - - vpabsb %xmm16, %xmm19 # CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vpabsb (%rax), %xmm19 # CHECK-NEXT: - - - 0.50 - 0.50 - - vpabsb %xmm16, %xmm19 {%k1} @@ -1263,8 +1363,8 @@ # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2b %k0, %xmm0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2w %k0, %xmm0 -# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2b %k0, %ymm0 -# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2w %k0, %ymm0 +# CHECK-NEXT: - - 0.50 - - 0.50 - - vpmovm2b %k0, %ymm0 +# CHECK-NEXT: - - 0.50 - - 0.50 - - vpmovm2w %k0, %ymm0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovb2m %xmm0, %k0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovw2m %xmm0, %k0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovb2m %ymm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512dq.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512dq.s --- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512dq.s +++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512dq.s @@ -336,8 +336,8 @@ # CHECK-NEXT: 1 1 1.00 vxorps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vxorps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vxorps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.33 vpmovm2d %k0, %zmm0 -# CHECK-NEXT: 1 1 0.33 vpmovm2q %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2d %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2q %k0, %zmm0 # CHECK-NEXT: 1 1 0.33 vpmovd2m %zmm0, %k0 # CHECK-NEXT: 1 1 0.33 vpmovq2m %zmm0, %k0 @@ -353,7 +353,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - - 9.67 41.67 - 110.67 46.50 46.50 +# CHECK-NEXT: - - 10.00 41.00 - 111.00 46.50 46.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -506,7 +506,7 @@ # CHECK-NEXT: - - - - - 1.00 - - vxorps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vxorps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vxorps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2d %k0, %zmm0 -# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - 
vpmovm2q %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - 0.50 - - vpmovm2d %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - 0.50 - - vpmovm2q %k0, %zmm0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovd2m %zmm0, %k0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovq2m %zmm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512dqvl.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512dqvl.s --- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512dqvl.s +++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512dqvl.s @@ -459,8 +459,8 @@ # CHECK-NEXT: 2 8 1.00 * vxorps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 0.33 vpmovm2d %k0, %xmm0 # CHECK-NEXT: 1 1 0.33 vpmovm2q %k0, %xmm0 -# CHECK-NEXT: 1 1 0.33 vpmovm2d %k0, %ymm0 -# CHECK-NEXT: 1 1 0.33 vpmovm2q %k0, %ymm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2d %k0, %ymm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2q %k0, %ymm0 # CHECK-NEXT: 1 1 0.33 vpmovd2m %xmm0, %k0 # CHECK-NEXT: 1 1 0.33 vpmovq2m %xmm0, %k0 # CHECK-NEXT: 1 1 0.33 vpmovd2m %ymm0, %k0 @@ -478,7 +478,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - - 14.67 46.17 - 169.17 66.50 66.50 +# CHECK-NEXT: - - 15.00 45.50 - 169.50 66.50 66.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -688,8 +688,8 @@ # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vxorps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2d %k0, %xmm0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2q %k0, %xmm0 -# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2d %k0, %ymm0 -# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovm2q %k0, %ymm0 +# CHECK-NEXT: - - 0.50 - - 0.50 - - vpmovm2d %k0, %ymm0 +# CHECK-NEXT: - - 0.50 - - 0.50 - - vpmovm2q %k0, %ymm0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovd2m %xmm0, %k0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovq2m %xmm0, %k0 # CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vpmovd2m %ymm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s --- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s @@ -308,6 +308,42 @@ vminps (%rax), %ymm17, %ymm19 {z}{k1} vminps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vmovapd %xmm16, %xmm19 +vmovapd (%rax), %xmm19 +vmovapd %xmm16, (%rax) +vmovapd %xmm16, %xmm19 {k1} +vmovapd (%rax), %xmm19 {k1} +vmovapd %xmm16, (%rax) {k1} +vmovapd %xmm16, %xmm19 {z}{k1} +vmovapd (%rax), %xmm19 {z}{k1} + +vmovapd %ymm16, %ymm19 +vmovapd (%rax), %ymm19 +vmovapd %ymm16, (%rax) +vmovapd %ymm16, %ymm19 {k1} +vmovapd (%rax), %ymm19 {k1} +vmovapd %ymm16, (%rax) {k1} +vmovapd %ymm16, %ymm19 {z}{k1} +vmovapd (%rax), %ymm19 {z}{k1} + +vmovaps %xmm16, %xmm19 +vmovaps (%rax), %xmm19 +vmovaps %xmm16, (%rax) +vmovaps %xmm16, %xmm19 {k1} +vmovaps (%rax), %xmm19 {k1} +vmovaps %xmm16, (%rax) {k1} +vmovaps %xmm16, %xmm19 {z}{k1} +vmovaps (%rax), %xmm19 {z}{k1} + +vmovaps %ymm16, %ymm19 +vmovaps (%rax), %ymm19 +vmovaps %ymm16, (%rax) +vmovaps %ymm16, %ymm19 {k1} +vmovaps (%rax), %ymm19 {k1} +vmovaps %ymm16, (%rax) {k1} +vmovaps %ymm16, %ymm19 {z}{k1} +vmovaps (%rax), %ymm19 {z}{k1} + vmovddup %xmm16, %xmm19 vmovddup (%rax), %xmm19 vmovddup %xmm16, %xmm19 {k1} @@ -315,6 +351,78 @@ vmovddup %xmm16, %xmm19 {z}{k1} vmovddup (%rax), %xmm19 {z}{k1} +vmovdqa32 %xmm16, %xmm19 +vmovdqa32 (%rax), %xmm19 +vmovdqa32 %xmm16, (%rax) +vmovdqa32 %xmm16, %xmm19 {k1} +vmovdqa32 (%rax), %xmm19 {k1} +vmovdqa32 %xmm16, (%rax) {k1} 
+vmovdqa32 %xmm16, %xmm19 {z}{k1} +vmovdqa32 (%rax), %xmm19 {z}{k1} + +vmovdqa32 %ymm16, %ymm19 +vmovdqa32 (%rax), %ymm19 +vmovdqa32 %ymm16, (%rax) +vmovdqa32 %ymm16, %ymm19 {k1} +vmovdqa32 (%rax), %ymm19 {k1} +vmovdqa32 %ymm16, (%rax) {k1} +vmovdqa32 %ymm16, %ymm19 {z}{k1} +vmovdqa32 (%rax), %ymm19 {z}{k1} + +vmovdqa64 %xmm16, %xmm19 +vmovdqa64 (%rax), %xmm19 +vmovdqa64 %xmm16, (%rax) +vmovdqa64 %xmm16, %xmm19 {k1} +vmovdqa64 (%rax), %xmm19 {k1} +vmovdqa64 %xmm16, (%rax) {k1} +vmovdqa64 %xmm16, %xmm19 {z}{k1} +vmovdqa64 (%rax), %xmm19 {z}{k1} + +vmovdqa64 %ymm16, %ymm19 +vmovdqa64 (%rax), %ymm19 +vmovdqa64 %ymm16, (%rax) +vmovdqa64 %ymm16, %ymm19 {k1} +vmovdqa64 (%rax), %ymm19 {k1} +vmovdqa64 %ymm16, (%rax) {k1} +vmovdqa64 %ymm16, %ymm19 {z}{k1} +vmovdqa64 (%rax), %ymm19 {z}{k1} + +vmovdqu32 %xmm16, %xmm19 +vmovdqu32 (%rax), %xmm19 +vmovdqu32 %xmm16, (%rax) +vmovdqu32 %xmm16, %xmm19 {k1} +vmovdqu32 (%rax), %xmm19 {k1} +vmovdqu32 %xmm16, (%rax) {k1} +vmovdqu32 %xmm16, %xmm19 {z}{k1} +vmovdqu32 (%rax), %xmm19 {z}{k1} + +vmovdqu32 %ymm16, %ymm19 +vmovdqu32 (%rax), %ymm19 +vmovdqu32 %ymm16, (%rax) +vmovdqu32 %ymm16, %ymm19 {k1} +vmovdqu32 (%rax), %ymm19 {k1} +vmovdqu32 %ymm16, (%rax) {k1} +vmovdqu32 %ymm16, %ymm19 {z}{k1} +vmovdqu32 (%rax), %ymm19 {z}{k1} + +vmovdqu64 %xmm16, %xmm19 +vmovdqu64 (%rax), %xmm19 +vmovdqu64 %xmm16, (%rax) +vmovdqu64 %xmm16, %xmm19 {k1} +vmovdqu64 (%rax), %xmm19 {k1} +vmovdqu64 %xmm16, (%rax) {k1} +vmovdqu64 %xmm16, %xmm19 {z}{k1} +vmovdqu64 (%rax), %xmm19 {z}{k1} + +vmovdqu64 %ymm16, %ymm19 +vmovdqu64 (%rax), %ymm19 +vmovdqu64 %ymm16, (%rax) +vmovdqu64 %ymm16, %ymm19 {k1} +vmovdqu64 (%rax), %ymm19 {k1} +vmovdqu64 %ymm16, (%rax) {k1} +vmovdqu64 %ymm16, %ymm19 {z}{k1} +vmovdqu64 (%rax), %ymm19 {z}{k1} + vmovddup %ymm16, %ymm19 vmovddup (%rax), %ymm19 vmovddup %ymm16, %ymm19 {k1} @@ -350,6 +458,42 @@ vmovsldup %ymm16, %ymm19 {z}{k1} vmovsldup (%rax), %ymm19 {z}{k1} +vmovupd %xmm16, %xmm19 +vmovupd (%rax), %xmm19 +vmovupd %xmm16, (%rax) +vmovupd %xmm16, %xmm19 {k1} +vmovupd (%rax), %xmm19 {k1} +vmovupd %xmm16, (%rax) {k1} +vmovupd %xmm16, %xmm19 {z}{k1} +vmovupd (%rax), %xmm19 {z}{k1} + +vmovupd %ymm16, %ymm19 +vmovupd (%rax), %ymm19 +vmovupd %ymm16, (%rax) +vmovupd %ymm16, %ymm19 {k1} +vmovupd (%rax), %ymm19 {k1} +vmovupd %ymm16, (%rax) {k1} +vmovupd %ymm16, %ymm19 {z}{k1} +vmovupd (%rax), %ymm19 {z}{k1} + +vmovups %xmm16, %xmm19 +vmovups (%rax), %xmm19 +vmovups %xmm16, (%rax) +vmovups %xmm16, %xmm19 {k1} +vmovups (%rax), %xmm19 {k1} +vmovups %xmm16, (%rax) {k1} +vmovups %xmm16, %xmm19 {z}{k1} +vmovups (%rax), %xmm19 {z}{k1} + +vmovups %ymm16, %ymm19 +vmovups (%rax), %ymm19 +vmovups %ymm16, (%rax) +vmovups %ymm16, %ymm19 {k1} +vmovups (%rax), %ymm19 {k1} +vmovups %ymm16, (%rax) {k1} +vmovups %ymm16, %ymm19 {z}{k1} +vmovups (%rax), %ymm19 {z}{k1} + vmulpd %xmm16, %xmm17, %xmm19 vmulpd (%rax), %xmm17, %xmm19 vmulpd (%rax){1to2}, %xmm17, %xmm19 @@ -1517,12 +1661,108 @@ # CHECK-NEXT: 1 3 1.00 vminps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 10 1.00 * vminps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 10 1.00 * vminps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovapd %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovapd (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovapd %xmm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovapd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovapd (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovapd %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovapd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovapd (%rax), 
%xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovapd %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovapd (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovapd %ymm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovapd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovapd (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovapd %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovapd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovapd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovaps %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovaps (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovaps %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovaps (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovaps %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovaps (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovaps %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovaps (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovaps %ymm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovaps %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovaps (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovaps %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovaps %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovaps (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovddup %xmm16, %xmm19 # CHECK-NEXT: 2 7 1.00 * vmovddup (%rax), %xmm19 # CHECK-NEXT: 1 1 1.00 vmovddup %xmm16, %xmm19 {%k1} # CHECK-NEXT: 2 7 1.00 * vmovddup (%rax), %xmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovddup %xmm16, %xmm19 {%k1} {z} # CHECK-NEXT: 2 7 1.00 * vmovddup (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovdqa32 (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqa32 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovdqa32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqa32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovdqa32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqa32 (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqa32 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqa32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqa32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqa32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovdqa64 (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqa64 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovdqa64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqa64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovdqa64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqa64 (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqa64 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqa64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqa64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqa64 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovdqu32 (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu32 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu32 
%xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovdqu32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovdqu32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqu32 (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu32 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqu32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqu32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovdqu64 (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu64 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovdqu64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovdqu64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovdqu64 (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovdqu64 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovdqu64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovdqu64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovdqu64 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovddup %ymm16, %ymm19 # CHECK-NEXT: 2 8 1.00 * vmovddup (%rax), %ymm19 # CHECK-NEXT: 1 1 1.00 vmovddup %ymm16, %ymm19 {%k1} @@ -1553,6 +1793,38 @@ # CHECK-NEXT: 2 8 1.00 * vmovsldup (%rax), %ymm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovsldup %ymm16, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vmovsldup (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovupd %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovupd (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovupd %xmm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovupd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovupd (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovupd %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovupd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovupd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovupd %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovupd (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovupd %ymm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovupd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovupd (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovupd %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovupd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 7 0.50 * vmovupd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovups %xmm16, %xmm19 +# CHECK-NEXT: 1 6 0.50 * vmovups (%rax), %xmm19 +# CHECK-NEXT: 1 1 1.00 * vmovups %xmm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovups %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 1 6 0.50 * vmovups (%rax), %xmm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovups %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovups %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 1 6 0.50 * vmovups (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 vmovups %ymm16, %ymm19 +# CHECK-NEXT: 1 7 0.50 * vmovups (%rax), %ymm19 +# CHECK-NEXT: 1 1 1.00 * vmovups %ymm16, (%rax) +# CHECK-NEXT: 1 1 1.00 vmovups %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 1 7 0.50 * vmovups (%rax), %ymm19 {%k1} +# CHECK-NEXT: 1 1 1.00 * vmovups %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 1.00 vmovups %ymm16, %ymm19 {%k1} 
{z} +# CHECK-NEXT: 1 7 0.50 * vmovups (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 5 1.00 vmulpd %xmm16, %xmm17, %xmm19 # CHECK-NEXT: 2 11 1.00 * vmulpd (%rax), %xmm17, %xmm19 # CHECK-NEXT: 2 11 1.00 * vmulpd (%rax){1to2}, %xmm17, %xmm19 @@ -2348,7 +2620,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - 1935.00 196.00 359.50 - 608.50 350.50 350.50 +# CHECK-NEXT: - 1935.00 206.00 363.50 32.00 642.50 390.50 390.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -2624,12 +2896,108 @@ # CHECK-NEXT: - - - 1.00 - - - - vminps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - 1.00 - - 0.50 0.50 vminps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - 1.00 - - 0.50 0.50 vminps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovapd %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovapd (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovapd %xmm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovapd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovapd (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovapd %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovapd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovapd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovapd %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovapd (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovapd %ymm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovapd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovapd (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovapd %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovapd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovapd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovaps %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovaps (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovaps %xmm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovaps %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovaps (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovaps %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovaps %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovaps (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovaps %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovaps (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovaps %ymm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovaps %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovaps (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovaps %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovaps %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovaps (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 - - vmovddup %xmm16, %xmm19 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovddup (%rax), %xmm19 # CHECK-NEXT: - - - - - 1.00 - - vmovddup %xmm16, %xmm19 {%k1} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovddup (%rax), %xmm19 {%k1} # CHECK-NEXT: - - - - - 1.00 - - vmovddup %xmm16, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovddup (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqa32 %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa32 (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa32 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqa32 %xmm16, %xmm19 
{%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqa32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa32 %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa32 (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa32 %ymm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqa64 %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa64 (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa64 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqa64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqa64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa64 %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa64 (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa64 %ymm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqa64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqa64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqa64 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu32 %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu32 (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu32 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu32 %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu32 (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu32 %ymm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu64 %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu64 (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu64 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - vmovdqu64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu64 (%rax), %xmm19 {%k1} 
{z} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu64 %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu64 (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu64 %ymm16, (%rax) +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovdqu64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - 0.50 - - vmovdqu64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovdqu64 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 - - vmovddup %ymm16, %ymm19 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovddup (%rax), %ymm19 # CHECK-NEXT: - - - - - 1.00 - - vmovddup %ymm16, %ymm19 {%k1} @@ -2660,6 +3028,38 @@ # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovsldup (%rax), %ymm19 {%k1} # CHECK-NEXT: - - - - - 1.00 - - vmovsldup %ymm16, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vmovsldup (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovupd %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovupd (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovupd %xmm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovupd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovupd (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovupd %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovupd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovupd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovupd %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovupd (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovupd %ymm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovupd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovupd (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovupd %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovupd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovupd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovups %xmm16, %xmm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovups (%rax), %xmm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovups %xmm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovups %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovups (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovups %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovups %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovups (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - - - - 1.00 - - vmovups %ymm16, %ymm19 +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovups (%rax), %ymm19 +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovups %ymm16, (%rax) +# CHECK-NEXT: - - - - - 1.00 - - vmovups %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovups (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vmovups %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - - - - 1.00 - - vmovups %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - - - 0.50 0.50 vmovups (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - 1.00 - - - - - vmulpd %xmm16, %xmm17, %xmm19 # CHECK-NEXT: - - 1.00 - - - 0.50 0.50 vmulpd (%rax), %xmm17, %xmm19 # CHECK-NEXT: - - 1.00 - - - 0.50 0.50 vmulpd (%rax){1to2}, %xmm17, %xmm19 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s @@ -212,6 +212,24 @@ vminps (%rax), %zmm17, %zmm19 
{z}{k1} vminps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} +vmovapd %zmm16, %zmm19 +vmovapd (%rax), %zmm19 +vmovapd %zmm16, (%rax) +vmovapd %zmm16, %zmm19 {k1} +vmovapd (%rax), %zmm19 {k1} +vmovapd %zmm16, (%rax) {k1} +vmovapd %zmm16, %zmm19 {z}{k1} +vmovapd (%rax), %zmm19 {z}{k1} + +vmovaps %zmm16, %zmm19 +vmovaps (%rax), %zmm19 +vmovaps %zmm16, (%rax) +vmovaps %zmm16, %zmm19 {k1} +vmovaps (%rax), %zmm19 {k1} +vmovaps %zmm16, (%rax) {k1} +vmovaps %zmm16, %zmm19 {z}{k1} +vmovaps (%rax), %zmm19 {z}{k1} + vmovddup %zmm16, %zmm19 vmovddup (%rax), %zmm19 vmovddup %zmm16, %zmm19 {k1} @@ -219,6 +237,42 @@ vmovddup %zmm16, %zmm19 {z}{k1} vmovddup (%rax), %zmm19 {z}{k1} +vmovdqa32 %zmm16, %zmm19 +vmovdqa32 (%rax), %zmm19 +vmovdqa32 %zmm16, (%rax) +vmovdqa32 %zmm16, %zmm19 {k1} +vmovdqa32 (%rax), %zmm19 {k1} +vmovdqa32 %zmm16, (%rax) {k1} +vmovdqa32 %zmm16, %zmm19 {z}{k1} +vmovdqa32 (%rax), %zmm19 {z}{k1} + +vmovdqa64 %zmm16, %zmm19 +vmovdqa64 (%rax), %zmm19 +vmovdqa64 %zmm16, (%rax) +vmovdqa64 %zmm16, %zmm19 {k1} +vmovdqa64 (%rax), %zmm19 {k1} +vmovdqa64 %zmm16, (%rax) {k1} +vmovdqa64 %zmm16, %zmm19 {z}{k1} +vmovdqa64 (%rax), %zmm19 {z}{k1} + +vmovdqu32 %zmm16, %zmm19 +vmovdqu32 (%rax), %zmm19 +vmovdqu32 %zmm16, (%rax) +vmovdqu32 %zmm16, %zmm19 {k1} +vmovdqu32 (%rax), %zmm19 {k1} +vmovdqu32 %zmm16, (%rax) {k1} +vmovdqu32 %zmm16, %zmm19 {z}{k1} +vmovdqu32 (%rax), %zmm19 {z}{k1} + +vmovdqu64 %zmm16, %zmm19 +vmovdqu64 (%rax), %zmm19 +vmovdqu64 %zmm16, (%rax) +vmovdqu64 %zmm16, %zmm19 {k1} +vmovdqu64 (%rax), %zmm19 {k1} +vmovdqu64 %zmm16, (%rax) {k1} +vmovdqu64 %zmm16, %zmm19 {z}{k1} +vmovdqu64 (%rax), %zmm19 {z}{k1} + vmovshdup %zmm16, %zmm19 vmovshdup (%rax), %zmm19 vmovshdup %zmm16, %zmm19 {k1} @@ -233,6 +287,24 @@ vmovsldup %zmm16, %zmm19 {z}{k1} vmovsldup (%rax), %zmm19 {z}{k1} +vmovupd %zmm16, %zmm19 +vmovupd (%rax), %zmm19 +vmovupd %zmm16, (%rax) +vmovupd %zmm16, %zmm19 {k1} +vmovupd (%rax), %zmm19 {k1} +vmovupd %zmm16, (%rax) {k1} +vmovupd %zmm16, %zmm19 {z}{k1} +vmovupd (%rax), %zmm19 {z}{k1} + +vmovups %zmm16, %zmm19 +vmovups (%rax), %zmm19 +vmovups %zmm16, (%rax) +vmovups %zmm16, %zmm19 {k1} +vmovups (%rax), %zmm19 {k1} +vmovups %zmm16, (%rax) {k1} +vmovups %zmm16, %zmm19 {z}{k1} +vmovups (%rax), %zmm19 {z}{k1} + vmulpd %zmm16, %zmm17, %zmm19 vmulpd (%rax), %zmm17, %zmm19 vmulpd (%rax){1to8}, %zmm17, %zmm19 @@ -996,12 +1068,60 @@ # CHECK-NEXT: 1 4 0.50 vminps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 11 0.50 * vminps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 11 0.50 * vminps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovapd %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovapd %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovapd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovapd %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovapd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovaps %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovaps %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovaps %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovaps %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovaps %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovddup %zmm16, %zmm19 # CHECK-NEXT: 2 8 0.50 * vmovddup (%rax), %zmm19 # CHECK-NEXT: 1 1 1.00 vmovddup %zmm16, 
%zmm19 {%k1} # CHECK-NEXT: 2 8 0.50 * vmovddup (%rax), %zmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovddup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vmovddup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovshdup %zmm16, %zmm19 # CHECK-NEXT: 2 8 0.50 * vmovshdup (%rax), %zmm19 # CHECK-NEXT: 1 1 1.00 vmovshdup %zmm16, %zmm19 {%k1} @@ -1014,6 +1134,22 @@ # CHECK-NEXT: 2 8 0.50 * vmovsldup (%rax), %zmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovsldup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vmovsldup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovupd %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovupd %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovupd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovupd %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovupd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovups %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovups %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovups %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovups %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovups %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 4 0.50 vmulpd %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 11 0.50 * vmulpd (%rax), %zmm17, %zmm19 # CHECK-NEXT: 2 11 0.50 * vmulpd (%rax){1to8}, %zmm17, %zmm19 @@ -1535,7 +1671,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - 612.00 220.67 41.67 261.50 261.50 - 539.67 2.00 - - - +# CHECK-NEXT: - 612.00 240.67 49.67 278.83 
278.83 16.00 559.67 2.00 5.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -1723,12 +1859,60 @@ # CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vminps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vminps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vminps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovapd %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovapd (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovapd %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovapd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovapd (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovapd %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovapd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovapd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovaps %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovaps (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovaps %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovaps %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovaps (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovaps %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovaps %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovaps (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovddup %zmm16, %zmm19 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovddup (%rax), %zmm19 # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovddup %zmm16, %zmm19 {%k1} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovddup (%rax), %zmm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovddup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovddup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqa32 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa32 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa32 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqa32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqa32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqa64 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa64 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa64 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqa64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqa64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa64 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu32 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - 
vmovdqu32 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu32 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu64 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu64 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu64 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu64 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovshdup %zmm16, %zmm19 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovshdup (%rax), %zmm19 # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovshdup %zmm16, %zmm19 {%k1} @@ -1741,6 +1925,22 @@ # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovsldup (%rax), %zmm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovsldup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovsldup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovupd %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovupd (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovupd %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovupd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovupd (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovupd %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovupd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovupd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovups %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovups %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovups %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovups %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovups %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmulpd %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulpd (%rax), %zmm17, %zmm19 # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulpd (%rax){1to8}, %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512bw.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512bw.s --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512bw.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512bw.s @@ -20,6 +20,24 @@ kunpckdq %k0, %k1, %k2 kunpckwd %k0, %k1, %k2 +vmovdqu8 %zmm16, %zmm19 +vmovdqu8 (%rax), %zmm19 +vmovdqu8 %zmm16, (%rax) +vmovdqu8 
%zmm16, %zmm19 {k1} +vmovdqu8 (%rax), %zmm19 {k1} +vmovdqu8 %zmm16, (%rax) {k1} +vmovdqu8 %zmm16, %zmm19 {z}{k1} +vmovdqu8 (%rax), %zmm19 {z}{k1} + +vmovdqu16 %zmm16, %zmm19 +vmovdqu16 (%rax), %zmm19 +vmovdqu16 %zmm16, (%rax) +vmovdqu16 %zmm16, %zmm19 {k1} +vmovdqu16 (%rax), %zmm19 {k1} +vmovdqu16 %zmm16, (%rax) {k1} +vmovdqu16 %zmm16, %zmm19 {z}{k1} +vmovdqu16 (%rax), %zmm19 {z}{k1} + vpabsb %zmm16, %zmm19 vpabsb (%rax), %zmm19 vpabsb %zmm16, %zmm19 {k1} @@ -280,6 +298,22 @@ # CHECK-NEXT: 1 4 1.00 kshiftrq $2, %k1, %k2 # CHECK-NEXT: 1 4 1.00 kunpckdq %k0, %k1, %k2 # CHECK-NEXT: 1 4 1.00 kunpckwd %k0, %k1, %k2 +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %zmm19 +# CHECK-NEXT: 5 2 2.00 * vmovdqu8 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 5 2 2.00 * vmovdqu8 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vpabsb %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpabsb (%rax), %zmm19 # CHECK-NEXT: 1 1 1.00 vpabsb %zmm16, %zmm19 {%k1} @@ -470,8 +504,8 @@ # CHECK-NEXT: 2 8 1.00 * vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vpunpcklwd %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.25 vpmovm2b %k0, %zmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2w %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2b %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2w %k0, %zmm0 # CHECK-NEXT: 1 1 1.00 vpmovb2m %zmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovw2m %zmm0, %k0 @@ -491,7 +525,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - - 32.50 8.50 47.50 47.50 - 176.50 0.50 - - - +# CHECK-NEXT: - - 38.00 11.00 52.50 52.50 6.00 183.00 - 2.00 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -513,6 +547,22 @@ # CHECK-NEXT: - - - - - - - 1.00 - - - - kshiftrq $2, %k1, %k2 # CHECK-NEXT: - - - - - - - 1.00 - - - - kunpckdq %k0, %k1, %k2 # CHECK-NEXT: - - - - - - - 1.00 - - - - kunpckwd %k0, %k1, %k2 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu8 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu8 (%rax), %zmm19 +# CHECK-NEXT: - - - 0.50 0.67 0.67 2.00 0.50 - 0.67 - - vmovdqu8 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu8 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu8 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - 0.50 0.67 0.67 2.00 0.50 - 0.67 - - vmovdqu8 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu8 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu8 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu16 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu16 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu16 
%zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu16 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu16 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu16 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovdqu16 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu16 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.00 - - - - - - - - - vpabsb %zmm16, %zmm19 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vpabsb (%rax), %zmm19 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpabsb %zmm16, %zmm19 {%k1} @@ -703,7 +753,7 @@ # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vpunpcklwd %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2b %k0, %zmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2w %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vpmovm2b %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vpmovm2w %k0, %zmm0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovb2m %zmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovw2m %zmm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512bwvl.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512bwvl.s --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512bwvl.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512bwvl.s @@ -1,6 +1,42 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=icelake-server -instruction-tables < %s | FileCheck %s +vmovdqu8 %xmm16, %xmm19 +vmovdqu8 (%rax), %xmm19 +vmovdqu8 %xmm16, (%rax) +vmovdqu8 %xmm16, %xmm19 {k1} +vmovdqu8 (%rax), %xmm19 {k1} +vmovdqu8 %xmm16, (%rax) {k1} +vmovdqu8 %xmm16, %xmm19 {z}{k1} +vmovdqu8 (%rax), %xmm19 {z}{k1} + +vmovdqu8 %ymm16, %ymm19 +vmovdqu8 (%rax), %ymm19 +vmovdqu8 %ymm16, (%rax) +vmovdqu8 %ymm16, %ymm19 {k1} +vmovdqu8 (%rax), %ymm19 {k1} +vmovdqu8 %ymm16, (%rax) {k1} +vmovdqu8 %ymm16, %ymm19 {z}{k1} +vmovdqu8 (%rax), %ymm19 {z}{k1} + +vmovdqu16 %xmm16, %xmm19 +vmovdqu16 (%rax), %xmm19 +vmovdqu16 %xmm16, (%rax) +vmovdqu16 %xmm16, %xmm19 {k1} +vmovdqu16 (%rax), %xmm19 {k1} +vmovdqu16 %xmm16, (%rax) {k1} +vmovdqu16 %xmm16, %xmm19 {z}{k1} +vmovdqu16 (%rax), %xmm19 {z}{k1} + +vmovdqu16 %ymm16, %ymm19 +vmovdqu16 (%rax), %ymm19 +vmovdqu16 %ymm16, (%rax) +vmovdqu16 %ymm16, %ymm19 {k1} +vmovdqu16 (%rax), %ymm19 {k1} +vmovdqu16 %ymm16, (%rax) {k1} +vmovdqu16 %ymm16, %ymm19 {z}{k1} +vmovdqu16 (%rax), %ymm19 {z}{k1} + vpabsb %xmm16, %xmm19 vpabsb (%rax), %xmm19 vpabsb %xmm16, %xmm19 {k1} @@ -476,6 +512,38 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqu8 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu8 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqu8 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu8 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqu8 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu8 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %ymm16, %ymm19 
{%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu8 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqu16 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqu16 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqu16 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 0.50 vpabsb %xmm16, %xmm19 # CHECK-NEXT: 2 7 0.50 * vpabsb (%rax), %xmm19 # CHECK-NEXT: 1 1 0.50 vpabsb %xmm16, %xmm19 {%k1} @@ -856,10 +924,10 @@ # CHECK-NEXT: 2 8 0.50 * vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} # CHECK-NEXT: 1 1 0.50 vpunpcklwd %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.25 vpmovm2b %k0, %xmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2w %k0, %xmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2b %k0, %ymm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2w %k0, %ymm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2b %k0, %xmm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2w %k0, %xmm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2b %k0, %ymm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2w %k0, %ymm0 # CHECK-NEXT: 1 1 1.00 vpmovb2m %xmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovw2m %xmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovb2m %ymm0, %k0 @@ -881,10 +949,42 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - - 33.00 81.00 95.00 95.00 - 285.00 1.00 - - - +# CHECK-NEXT: - - 41.33 89.33 103.67 103.67 8.00 293.33 - 2.67 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu8 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu8 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu8 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu8 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu8 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu8 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu8 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu8 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu8 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu8 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu8 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu8 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu8 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu8 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu8 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 
0.33 0.50 0.50 - 0.33 - - - - vmovdqu8 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu16 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu16 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu16 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu16 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu16 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu16 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu16 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu16 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu16 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu16 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu16 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu16 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu16 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu16 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu16 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu16 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.50 0.50 - - - - - - - - vpabsb %xmm16, %xmm19 # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - - - vpabsb (%rax), %xmm19 # CHECK-NEXT: - - 0.50 0.50 - - - - - - - - vpabsb %xmm16, %xmm19 {%k1} @@ -1265,10 +1365,10 @@ # CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - - - vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} # CHECK-NEXT: - - - 0.50 - - - 0.50 - - - - vpunpcklwd %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - - - vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2b %k0, %xmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2w %k0, %xmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2b %k0, %ymm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2w %k0, %ymm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vpmovm2b %k0, %xmm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vpmovm2w %k0, %xmm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vpmovm2b %k0, %ymm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vpmovm2w %k0, %ymm0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovb2m %xmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovw2m %xmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovb2m %ymm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s @@ -336,8 +336,8 @@ # CHECK-NEXT: 1 1 0.50 vxorps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vxorps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vxorps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.25 vpmovm2d %k0, %zmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2q %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2d %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2q %k0, %zmm0 # CHECK-NEXT: 1 1 1.00 vpmovd2m %zmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovq2m %zmm0, %k0 @@ -357,7 +357,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - - 65.50 10.50 46.50 46.50 - 95.50 0.50 - - - 
+# CHECK-NEXT: - - 66.00 10.00 46.50 46.50 - 96.00 - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -510,7 +510,7 @@ # CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vxorps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vxorps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vxorps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2d %k0, %zmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2q %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vpmovm2d %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vpmovm2q %k0, %zmm0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovd2m %zmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovq2m %zmm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dqvl.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dqvl.s --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dqvl.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dqvl.s @@ -457,10 +457,10 @@ # CHECK-NEXT: 1 1 0.33 vxorps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vxorps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vxorps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.25 vpmovm2d %k0, %xmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2q %k0, %xmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2d %k0, %ymm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2q %k0, %ymm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2d %k0, %xmm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2q %k0, %xmm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2d %k0, %ymm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2q %k0, %ymm0 # CHECK-NEXT: 1 1 1.00 vpmovd2m %xmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovq2m %xmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovd2m %ymm0, %k0 @@ -482,7 +482,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - - 77.00 73.00 66.50 66.50 - 85.00 1.00 - - - +# CHECK-NEXT: - - 77.33 73.33 66.50 66.50 - 85.33 - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -690,10 +690,10 @@ # CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vxorps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vxorps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vxorps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2d %k0, %xmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2q %k0, %xmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2d %k0, %ymm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - vpmovm2q %k0, %ymm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vpmovm2d %k0, %xmm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vpmovm2q %k0, %xmm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vpmovm2d %k0, %ymm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vpmovm2q %k0, %ymm0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovd2m %xmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovq2m %xmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - - - vpmovd2m %ymm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s @@ 
-308,6 +308,42 @@ vminps (%rax), %ymm17, %ymm19 {z}{k1} vminps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vmovapd %xmm16, %xmm19 +vmovapd (%rax), %xmm19 +vmovapd %xmm16, (%rax) +vmovapd %xmm16, %xmm19 {k1} +vmovapd (%rax), %xmm19 {k1} +vmovapd %xmm16, (%rax) {k1} +vmovapd %xmm16, %xmm19 {z}{k1} +vmovapd (%rax), %xmm19 {z}{k1} + +vmovapd %ymm16, %ymm19 +vmovapd (%rax), %ymm19 +vmovapd %ymm16, (%rax) +vmovapd %ymm16, %ymm19 {k1} +vmovapd (%rax), %ymm19 {k1} +vmovapd %ymm16, (%rax) {k1} +vmovapd %ymm16, %ymm19 {z}{k1} +vmovapd (%rax), %ymm19 {z}{k1} + +vmovaps %xmm16, %xmm19 +vmovaps (%rax), %xmm19 +vmovaps %xmm16, (%rax) +vmovaps %xmm16, %xmm19 {k1} +vmovaps (%rax), %xmm19 {k1} +vmovaps %xmm16, (%rax) {k1} +vmovaps %xmm16, %xmm19 {z}{k1} +vmovaps (%rax), %xmm19 {z}{k1} + +vmovaps %ymm16, %ymm19 +vmovaps (%rax), %ymm19 +vmovaps %ymm16, (%rax) +vmovaps %ymm16, %ymm19 {k1} +vmovaps (%rax), %ymm19 {k1} +vmovaps %ymm16, (%rax) {k1} +vmovaps %ymm16, %ymm19 {z}{k1} +vmovaps (%rax), %ymm19 {z}{k1} + vmovddup %xmm16, %xmm19 vmovddup (%rax), %xmm19 vmovddup %xmm16, %xmm19 {k1} @@ -315,6 +351,78 @@ vmovddup %xmm16, %xmm19 {z}{k1} vmovddup (%rax), %xmm19 {z}{k1} +vmovdqa32 %xmm16, %xmm19 +vmovdqa32 (%rax), %xmm19 +vmovdqa32 %xmm16, (%rax) +vmovdqa32 %xmm16, %xmm19 {k1} +vmovdqa32 (%rax), %xmm19 {k1} +vmovdqa32 %xmm16, (%rax) {k1} +vmovdqa32 %xmm16, %xmm19 {z}{k1} +vmovdqa32 (%rax), %xmm19 {z}{k1} + +vmovdqa32 %ymm16, %ymm19 +vmovdqa32 (%rax), %ymm19 +vmovdqa32 %ymm16, (%rax) +vmovdqa32 %ymm16, %ymm19 {k1} +vmovdqa32 (%rax), %ymm19 {k1} +vmovdqa32 %ymm16, (%rax) {k1} +vmovdqa32 %ymm16, %ymm19 {z}{k1} +vmovdqa32 (%rax), %ymm19 {z}{k1} + +vmovdqa64 %xmm16, %xmm19 +vmovdqa64 (%rax), %xmm19 +vmovdqa64 %xmm16, (%rax) +vmovdqa64 %xmm16, %xmm19 {k1} +vmovdqa64 (%rax), %xmm19 {k1} +vmovdqa64 %xmm16, (%rax) {k1} +vmovdqa64 %xmm16, %xmm19 {z}{k1} +vmovdqa64 (%rax), %xmm19 {z}{k1} + +vmovdqa64 %ymm16, %ymm19 +vmovdqa64 (%rax), %ymm19 +vmovdqa64 %ymm16, (%rax) +vmovdqa64 %ymm16, %ymm19 {k1} +vmovdqa64 (%rax), %ymm19 {k1} +vmovdqa64 %ymm16, (%rax) {k1} +vmovdqa64 %ymm16, %ymm19 {z}{k1} +vmovdqa64 (%rax), %ymm19 {z}{k1} + +vmovdqu32 %xmm16, %xmm19 +vmovdqu32 (%rax), %xmm19 +vmovdqu32 %xmm16, (%rax) +vmovdqu32 %xmm16, %xmm19 {k1} +vmovdqu32 (%rax), %xmm19 {k1} +vmovdqu32 %xmm16, (%rax) {k1} +vmovdqu32 %xmm16, %xmm19 {z}{k1} +vmovdqu32 (%rax), %xmm19 {z}{k1} + +vmovdqu32 %ymm16, %ymm19 +vmovdqu32 (%rax), %ymm19 +vmovdqu32 %ymm16, (%rax) +vmovdqu32 %ymm16, %ymm19 {k1} +vmovdqu32 (%rax), %ymm19 {k1} +vmovdqu32 %ymm16, (%rax) {k1} +vmovdqu32 %ymm16, %ymm19 {z}{k1} +vmovdqu32 (%rax), %ymm19 {z}{k1} + +vmovdqu64 %xmm16, %xmm19 +vmovdqu64 (%rax), %xmm19 +vmovdqu64 %xmm16, (%rax) +vmovdqu64 %xmm16, %xmm19 {k1} +vmovdqu64 (%rax), %xmm19 {k1} +vmovdqu64 %xmm16, (%rax) {k1} +vmovdqu64 %xmm16, %xmm19 {z}{k1} +vmovdqu64 (%rax), %xmm19 {z}{k1} + +vmovdqu64 %ymm16, %ymm19 +vmovdqu64 (%rax), %ymm19 +vmovdqu64 %ymm16, (%rax) +vmovdqu64 %ymm16, %ymm19 {k1} +vmovdqu64 (%rax), %ymm19 {k1} +vmovdqu64 %ymm16, (%rax) {k1} +vmovdqu64 %ymm16, %ymm19 {z}{k1} +vmovdqu64 (%rax), %ymm19 {z}{k1} + vmovddup %ymm16, %ymm19 vmovddup (%rax), %ymm19 vmovddup %ymm16, %ymm19 {k1} @@ -350,6 +458,42 @@ vmovsldup %ymm16, %ymm19 {z}{k1} vmovsldup (%rax), %ymm19 {z}{k1} +vmovupd %xmm16, %xmm19 +vmovupd (%rax), %xmm19 +vmovupd %xmm16, (%rax) +vmovupd %xmm16, %xmm19 {k1} +vmovupd (%rax), %xmm19 {k1} +vmovupd %xmm16, (%rax) {k1} +vmovupd %xmm16, %xmm19 {z}{k1} +vmovupd (%rax), %xmm19 {z}{k1} + +vmovupd %ymm16, %ymm19 +vmovupd (%rax), %ymm19 +vmovupd %ymm16, (%rax) 
+vmovupd %ymm16, %ymm19 {k1} +vmovupd (%rax), %ymm19 {k1} +vmovupd %ymm16, (%rax) {k1} +vmovupd %ymm16, %ymm19 {z}{k1} +vmovupd (%rax), %ymm19 {z}{k1} + +vmovups %xmm16, %xmm19 +vmovups (%rax), %xmm19 +vmovups %xmm16, (%rax) +vmovups %xmm16, %xmm19 {k1} +vmovups (%rax), %xmm19 {k1} +vmovups %xmm16, (%rax) {k1} +vmovups %xmm16, %xmm19 {z}{k1} +vmovups (%rax), %xmm19 {z}{k1} + +vmovups %ymm16, %ymm19 +vmovups (%rax), %ymm19 +vmovups %ymm16, (%rax) +vmovups %ymm16, %ymm19 {k1} +vmovups (%rax), %ymm19 {k1} +vmovups %ymm16, (%rax) {k1} +vmovups %ymm16, %ymm19 {z}{k1} +vmovups (%rax), %ymm19 {z}{k1} + vmulpd %xmm16, %xmm17, %xmm19 vmulpd (%rax), %xmm17, %xmm19 vmulpd (%rax){1to2}, %xmm17, %xmm19 @@ -1517,12 +1661,108 @@ # CHECK-NEXT: 1 4 0.50 vminps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 11 0.50 * vminps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 11 0.50 * vminps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovapd %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovapd (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovapd %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovapd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovapd (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovapd %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovapd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovapd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovapd %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovapd %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovapd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovapd %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovapd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovaps %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovaps (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovaps %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovaps %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovaps (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovaps %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovaps %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovaps (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovaps %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovaps %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovaps %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovaps %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovaps %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovddup %xmm16, %xmm19 # CHECK-NEXT: 2 7 0.50 * vmovddup (%rax), %xmm19 # CHECK-NEXT: 1 1 1.00 vmovddup %xmm16, %xmm19 {%k1} # CHECK-NEXT: 2 7 0.50 * vmovddup (%rax), %xmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovddup %xmm16, %xmm19 {%k1} {z} # CHECK-NEXT: 2 7 0.50 * vmovddup (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqa32 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqa32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqa32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %ymm16, (%rax) +# CHECK-NEXT: 1 1 
0.33 vmovdqa32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqa64 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqa64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqa64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqu32 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqu32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqu32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqu64 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqu64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqu64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovddup %ymm16, %ymm19 # CHECK-NEXT: 2 8 0.50 * vmovddup (%rax), %ymm19 # CHECK-NEXT: 1 1 1.00 vmovddup %ymm16, %ymm19 {%k1} @@ -1553,6 +1793,38 @@ # CHECK-NEXT: 2 8 0.50 * vmovsldup (%rax), %ymm19 {%k1} # CHECK-NEXT: 1 1 0.50 vmovsldup %ymm16, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vmovsldup (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovupd %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovupd (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovupd %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovupd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovupd (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovupd %xmm16, 
(%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovupd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovupd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovupd %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovupd %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovupd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovupd %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovupd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovups %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovups (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovups %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovups %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovups (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovups %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovups %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovups (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovups %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovups %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovups %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovups %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovups %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 4 0.50 vmulpd %xmm16, %xmm17, %xmm19 # CHECK-NEXT: 2 10 0.50 * vmulpd (%rax), %xmm17, %xmm19 # CHECK-NEXT: 2 10 0.50 * vmulpd (%rax){1to2}, %xmm17, %xmm19 @@ -2352,7 +2624,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - 423.00 257.33 232.33 372.50 372.50 - 652.33 4.00 - - - +# CHECK-NEXT: - 423.00 289.33 264.33 407.17 407.17 32.00 684.33 4.00 10.67 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -2628,12 +2900,108 @@ # CHECK-NEXT: - - 0.50 0.50 - - - - - - - - vminps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - - - vminps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - - - vminps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovapd %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovapd (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovapd %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovapd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovapd (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovapd %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovapd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovapd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovapd %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovapd (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovapd %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovapd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovapd (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovapd %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovapd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovapd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 
0.33 - - - 0.33 - - - - vmovaps %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovaps (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovaps %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovaps %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovaps (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovaps %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovaps %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovaps (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovaps %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovaps (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovaps %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovaps %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovaps (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovaps %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovaps %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovaps (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovddup %xmm16, %xmm19 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovddup (%rax), %xmm19 # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovddup %xmm16, %xmm19 {%k1} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovddup (%rax), %xmm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovddup %xmm16, %xmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovddup (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa32 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa32 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa32 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa32 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa32 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa32 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa64 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa64 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa64 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa64 %xmm16, %xmm19 {%k1} {z} +# 
CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa64 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa64 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa64 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqa64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqa64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqa64 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu32 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu32 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu32 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu32 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu32 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu32 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu64 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu64 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu64 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu64 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu64 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu64 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovdqu64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovdqu64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovdqu64 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovddup %ymm16, %ymm19 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovddup (%rax), %ymm19 # CHECK-NEXT: - - - - - - - 1.00 - - - - vmovddup %ymm16, %ymm19 {%k1} @@ -2664,6 +3032,38 @@ # 
CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovsldup (%rax), %ymm19 {%k1} # CHECK-NEXT: - - - 0.50 - - - 0.50 - - - - vmovsldup %ymm16, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovsldup (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovupd %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovupd (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovupd %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovupd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovupd (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovupd %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovupd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovupd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovupd %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovupd (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovupd %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovupd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovupd (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovupd %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovupd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovupd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovups %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovups %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovups %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovups %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovups %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovups %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovups %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovups %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 - - vmovups %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - - - vmovups %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.50 0.50 - - - - - - - - vmulpd %xmm16, %xmm17, %xmm19 # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - - - vmulpd (%rax), %xmm17, %xmm19 # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - - - vmulpd (%rax){1to2}, %xmm17, %xmm19 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s @@ -212,6 +212,24 @@ vminps (%rax), %zmm17, %zmm19 {z}{k1} vminps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} +vmovapd %zmm16, %zmm19 +vmovapd (%rax), %zmm19 +vmovapd %zmm16, (%rax) +vmovapd %zmm16, %zmm19 {k1} +vmovapd (%rax), %zmm19 {k1} +vmovapd %zmm16, (%rax) {k1} 
+vmovapd %zmm16, %zmm19 {z}{k1} +vmovapd (%rax), %zmm19 {z}{k1} + +vmovaps %zmm16, %zmm19 +vmovaps (%rax), %zmm19 +vmovaps %zmm16, (%rax) +vmovaps %zmm16, %zmm19 {k1} +vmovaps (%rax), %zmm19 {k1} +vmovaps %zmm16, (%rax) {k1} +vmovaps %zmm16, %zmm19 {z}{k1} +vmovaps (%rax), %zmm19 {z}{k1} + vmovddup %zmm16, %zmm19 vmovddup (%rax), %zmm19 vmovddup %zmm16, %zmm19 {k1} @@ -219,6 +237,42 @@ vmovddup %zmm16, %zmm19 {z}{k1} vmovddup (%rax), %zmm19 {z}{k1} +vmovdqa32 %zmm16, %zmm19 +vmovdqa32 (%rax), %zmm19 +vmovdqa32 %zmm16, (%rax) +vmovdqa32 %zmm16, %zmm19 {k1} +vmovdqa32 (%rax), %zmm19 {k1} +vmovdqa32 %zmm16, (%rax) {k1} +vmovdqa32 %zmm16, %zmm19 {z}{k1} +vmovdqa32 (%rax), %zmm19 {z}{k1} + +vmovdqa64 %zmm16, %zmm19 +vmovdqa64 (%rax), %zmm19 +vmovdqa64 %zmm16, (%rax) +vmovdqa64 %zmm16, %zmm19 {k1} +vmovdqa64 (%rax), %zmm19 {k1} +vmovdqa64 %zmm16, (%rax) {k1} +vmovdqa64 %zmm16, %zmm19 {z}{k1} +vmovdqa64 (%rax), %zmm19 {z}{k1} + +vmovdqu32 %zmm16, %zmm19 +vmovdqu32 (%rax), %zmm19 +vmovdqu32 %zmm16, (%rax) +vmovdqu32 %zmm16, %zmm19 {k1} +vmovdqu32 (%rax), %zmm19 {k1} +vmovdqu32 %zmm16, (%rax) {k1} +vmovdqu32 %zmm16, %zmm19 {z}{k1} +vmovdqu32 (%rax), %zmm19 {z}{k1} + +vmovdqu64 %zmm16, %zmm19 +vmovdqu64 (%rax), %zmm19 +vmovdqu64 %zmm16, (%rax) +vmovdqu64 %zmm16, %zmm19 {k1} +vmovdqu64 (%rax), %zmm19 {k1} +vmovdqu64 %zmm16, (%rax) {k1} +vmovdqu64 %zmm16, %zmm19 {z}{k1} +vmovdqu64 (%rax), %zmm19 {z}{k1} + vmovshdup %zmm16, %zmm19 vmovshdup (%rax), %zmm19 vmovshdup %zmm16, %zmm19 {k1} @@ -233,6 +287,24 @@ vmovsldup %zmm16, %zmm19 {z}{k1} vmovsldup (%rax), %zmm19 {z}{k1} +vmovupd %zmm16, %zmm19 +vmovupd (%rax), %zmm19 +vmovupd %zmm16, (%rax) +vmovupd %zmm16, %zmm19 {k1} +vmovupd (%rax), %zmm19 {k1} +vmovupd %zmm16, (%rax) {k1} +vmovupd %zmm16, %zmm19 {z}{k1} +vmovupd (%rax), %zmm19 {z}{k1} + +vmovups %zmm16, %zmm19 +vmovups (%rax), %zmm19 +vmovups %zmm16, (%rax) +vmovups %zmm16, %zmm19 {k1} +vmovups (%rax), %zmm19 {k1} +vmovups %zmm16, (%rax) {k1} +vmovups %zmm16, %zmm19 {z}{k1} +vmovups (%rax), %zmm19 {z}{k1} + vmulpd %zmm16, %zmm17, %zmm19 vmulpd (%rax), %zmm17, %zmm19 vmulpd (%rax){1to8}, %zmm17, %zmm19 @@ -996,12 +1068,60 @@ # CHECK-NEXT: 1 4 0.50 vminps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 11 0.50 * vminps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 11 0.50 * vminps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovapd %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovapd %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovapd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovapd %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovapd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovaps %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovaps %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovaps %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovaps %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovaps %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovddup %zmm16, %zmm19 # CHECK-NEXT: 2 8 0.50 * vmovddup (%rax), %zmm19 # CHECK-NEXT: 1 1 1.00 vmovddup %zmm16, %zmm19 {%k1} # CHECK-NEXT: 2 8 0.50 * vmovddup (%rax), %zmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovddup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vmovddup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 
vmovdqa32 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqa32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqa64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovshdup %zmm16, %zmm19 # CHECK-NEXT: 2 8 0.50 * vmovshdup (%rax), %zmm19 # CHECK-NEXT: 1 1 1.00 vmovshdup %zmm16, %zmm19 {%k1} @@ -1014,6 +1134,22 @@ # CHECK-NEXT: 2 8 0.50 * vmovsldup (%rax), %zmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovsldup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vmovsldup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovupd %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovupd %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovupd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovupd %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovupd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovups %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovups %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovups %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovups %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovups %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 4 0.50 vmulpd %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 11 0.50 * vmulpd (%rax), %zmm17, %zmm19 # CHECK-NEXT: 2 11 0.50 * vmulpd (%rax){1to8}, %zmm17, %zmm19 @@ -1533,7 +1669,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 612.00 220.67 41.67 261.50 261.50 - 539.67 2.00 - +# CHECK-NEXT: - 612.00 240.67 49.67 278.83 278.83 16.00 559.67 2.00 5.33 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -1721,12 +1857,60 @@ # CHECK-NEXT: - - 0.50 - - - - 0.50 - - vminps %zmm16, 
%zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - vminps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - vminps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovapd %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovapd (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovapd %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovapd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovapd (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovapd %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovapd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovapd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovaps %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovaps (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovaps %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovaps %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovaps (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovaps %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovaps %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovaps (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - 1.00 - - vmovddup %zmm16, %zmm19 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovddup (%rax), %zmm19 # CHECK-NEXT: - - - - - - - 1.00 - - vmovddup %zmm16, %zmm19 {%k1} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovddup (%rax), %zmm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vmovddup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovddup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqa32 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa32 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa32 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqa32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqa32 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqa64 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa64 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa64 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqa64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqa64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa64 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu32 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu32 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu32 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu32 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu32 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu32 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu32 %zmm16, %zmm19 {%k1} {z} +# 
CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu32 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu64 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu64 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu64 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu64 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu64 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu64 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu64 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu64 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - 1.00 - - vmovshdup %zmm16, %zmm19 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovshdup (%rax), %zmm19 # CHECK-NEXT: - - - - - - - 1.00 - - vmovshdup %zmm16, %zmm19 {%k1} @@ -1739,6 +1923,22 @@ # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovsldup (%rax), %zmm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vmovsldup %zmm16, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovsldup (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovupd %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovupd (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovupd %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovupd %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovupd (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovupd %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovupd %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovupd (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovups %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovups (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovups %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovups %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovups (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovups %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovups %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovups (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmulpd %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - vmulpd (%rax), %zmm17, %zmm19 # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - vmulpd (%rax){1to8}, %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512bw.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512bw.s --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512bw.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512bw.s @@ -20,6 +20,24 @@ kunpckdq %k0, %k1, %k2 kunpckwd %k0, %k1, %k2 +vmovdqu8 %zmm16, %zmm19 +vmovdqu8 (%rax), %zmm19 +vmovdqu8 %zmm16, (%rax) +vmovdqu8 %zmm16, %zmm19 {k1} +vmovdqu8 (%rax), %zmm19 {k1} +vmovdqu8 %zmm16, (%rax) {k1} +vmovdqu8 %zmm16, %zmm19 {z}{k1} +vmovdqu8 (%rax), %zmm19 {z}{k1} + +vmovdqu16 %zmm16, %zmm19 +vmovdqu16 (%rax), %zmm19 +vmovdqu16 %zmm16, (%rax) +vmovdqu16 %zmm16, %zmm19 {k1} +vmovdqu16 (%rax), %zmm19 {k1} +vmovdqu16 %zmm16, (%rax) {k1} +vmovdqu16 %zmm16, %zmm19 {z}{k1} +vmovdqu16 (%rax), %zmm19 {z}{k1} + vpabsb %zmm16, %zmm19 vpabsb (%rax), %zmm19 vpabsb %zmm16, %zmm19 {k1} @@ -280,6 +298,22 @@ # CHECK-NEXT: 1 4 1.00 kshiftrq $2, %k1, %k2 # CHECK-NEXT: 1 4 1.00 kunpckdq %k0, 
%k1, %k2 # CHECK-NEXT: 1 4 1.00 kunpckwd %k0, %k1, %k2 +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %zmm19 +# CHECK-NEXT: 5 2 2.00 * vmovdqu8 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 5 2 2.00 * vmovdqu8 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu8 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %zmm16, %zmm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %zmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %zmm16, (%rax) +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %zmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %zmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.50 vmovdqu16 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vpabsb %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpabsb (%rax), %zmm19 # CHECK-NEXT: 1 1 1.00 vpabsb %zmm16, %zmm19 {%k1} @@ -470,8 +504,8 @@ # CHECK-NEXT: 2 8 1.00 * vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vpunpcklwd %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.25 vpmovm2b %k0, %zmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2w %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2b %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2w %k0, %zmm0 # CHECK-NEXT: 1 1 1.00 vpmovb2m %zmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovw2m %zmm0, %k0 @@ -489,7 +523,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - - 32.50 8.50 47.50 47.50 - 176.50 0.50 - +# CHECK-NEXT: - - 38.00 11.00 52.50 52.50 6.00 183.00 - 2.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -511,6 +545,22 @@ # CHECK-NEXT: - - - - - - - 1.00 - - kshiftrq $2, %k1, %k2 # CHECK-NEXT: - - - - - - - 1.00 - - kunpckdq %k0, %k1, %k2 # CHECK-NEXT: - - - - - - - 1.00 - - kunpckwd %k0, %k1, %k2 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu8 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu8 (%rax), %zmm19 +# CHECK-NEXT: - - - 0.50 0.67 0.67 2.00 0.50 - 0.67 vmovdqu8 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu8 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu8 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - 0.50 0.67 0.67 2.00 0.50 - 0.67 vmovdqu8 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu8 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu8 (%rax), %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu16 %zmm16, %zmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu16 (%rax), %zmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu16 %zmm16, (%rax) +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu16 %zmm16, %zmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu16 (%rax), %zmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu16 %zmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vmovdqu16 %zmm16, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu16 (%rax), %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.00 - - - - - - - vpabsb %zmm16, %zmm19 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vpabsb (%rax), %zmm19 # CHECK-NEXT: - - 1.00 - - - - - - - vpabsb %zmm16, %zmm19 {%k1} @@ -701,7 +751,7 @@ # CHECK-NEXT: - - - - 0.50 0.50 
- 1.00 - - vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpunpcklwd %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpcklwd (%rax), %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2b %k0, %zmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2w %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vpmovm2b %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vpmovm2w %k0, %zmm0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovb2m %zmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovw2m %zmm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512bwvl.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512bwvl.s --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512bwvl.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512bwvl.s @@ -1,6 +1,42 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -instruction-tables < %s | FileCheck %s +vmovdqu8 %xmm16, %xmm19 +vmovdqu8 (%rax), %xmm19 +vmovdqu8 %xmm16, (%rax) +vmovdqu8 %xmm16, %xmm19 {k1} +vmovdqu8 (%rax), %xmm19 {k1} +vmovdqu8 %xmm16, (%rax) {k1} +vmovdqu8 %xmm16, %xmm19 {z}{k1} +vmovdqu8 (%rax), %xmm19 {z}{k1} + +vmovdqu8 %ymm16, %ymm19 +vmovdqu8 (%rax), %ymm19 +vmovdqu8 %ymm16, (%rax) +vmovdqu8 %ymm16, %ymm19 {k1} +vmovdqu8 (%rax), %ymm19 {k1} +vmovdqu8 %ymm16, (%rax) {k1} +vmovdqu8 %ymm16, %ymm19 {z}{k1} +vmovdqu8 (%rax), %ymm19 {z}{k1} + +vmovdqu16 %xmm16, %xmm19 +vmovdqu16 (%rax), %xmm19 +vmovdqu16 %xmm16, (%rax) +vmovdqu16 %xmm16, %xmm19 {k1} +vmovdqu16 (%rax), %xmm19 {k1} +vmovdqu16 %xmm16, (%rax) {k1} +vmovdqu16 %xmm16, %xmm19 {z}{k1} +vmovdqu16 (%rax), %xmm19 {z}{k1} + +vmovdqu16 %ymm16, %ymm19 +vmovdqu16 (%rax), %ymm19 +vmovdqu16 %ymm16, (%rax) +vmovdqu16 %ymm16, %ymm19 {k1} +vmovdqu16 (%rax), %ymm19 {k1} +vmovdqu16 %ymm16, (%rax) {k1} +vmovdqu16 %ymm16, %ymm19 {z}{k1} +vmovdqu16 (%rax), %ymm19 {z}{k1} + vpabsb %xmm16, %xmm19 vpabsb (%rax), %xmm19 vpabsb %xmm16, %xmm19 {k1} @@ -476,6 +512,38 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqu8 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu8 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqu8 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu8 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqu8 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu8 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu8 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu8 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu8 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqu16 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqu16 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqu16 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %ymm16, 
%ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu16 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu16 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu16 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 0.50 vpabsb %xmm16, %xmm19 # CHECK-NEXT: 2 7 0.50 * vpabsb (%rax), %xmm19 # CHECK-NEXT: 1 1 0.50 vpabsb %xmm16, %xmm19 {%k1} @@ -856,10 +924,10 @@ # CHECK-NEXT: 2 8 1.00 * vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} # CHECK-NEXT: 1 1 1.00 vpunpcklwd %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.25 vpmovm2b %k0, %xmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2w %k0, %xmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2b %k0, %ymm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2w %k0, %ymm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2b %k0, %xmm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2w %k0, %xmm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2b %k0, %ymm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2w %k0, %ymm0 # CHECK-NEXT: 1 1 1.00 vpmovb2m %xmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovw2m %xmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovb2m %ymm0, %k0 @@ -879,10 +947,42 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - - 33.00 29.00 95.00 95.00 - 337.00 1.00 - +# CHECK-NEXT: - - 41.33 37.33 103.67 103.67 8.00 345.33 - 2.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu8 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu8 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu8 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu8 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu8 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu8 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu8 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu8 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu8 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu8 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu8 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu8 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu8 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu8 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu8 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu8 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu16 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu16 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu16 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu16 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu16 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu16 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu16 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu16 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu16 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu16 
(%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu16 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu16 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu16 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu16 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu16 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu16 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.50 0.50 - - - - - - vpabsb %xmm16, %xmm19 # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - vpabsb (%rax), %xmm19 # CHECK-NEXT: - - 0.50 0.50 - - - - - - vpabsb %xmm16, %xmm19 {%k1} @@ -1263,10 +1363,10 @@ # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpunpcklwd %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpcklwd (%rax), %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2b %k0, %xmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2w %k0, %xmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2b %k0, %ymm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2w %k0, %ymm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vpmovm2b %k0, %xmm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vpmovm2w %k0, %xmm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vpmovm2b %k0, %ymm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vpmovm2w %k0, %ymm0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovb2m %xmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovw2m %xmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovb2m %ymm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512dq.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512dq.s --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512dq.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512dq.s @@ -336,8 +336,8 @@ # CHECK-NEXT: 1 1 0.50 vxorps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vxorps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vxorps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.25 vpmovm2d %k0, %zmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2q %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2d %k0, %zmm0 +# CHECK-NEXT: 1 1 0.50 vpmovm2q %k0, %zmm0 # CHECK-NEXT: 1 1 1.00 vpmovd2m %zmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovq2m %zmm0, %k0 @@ -355,7 +355,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - - 65.50 10.50 46.50 46.50 - 95.50 0.50 - +# CHECK-NEXT: - - 66.00 10.00 46.50 46.50 - 96.00 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -508,7 +508,7 @@ # CHECK-NEXT: - - 0.50 - - - - 0.50 - - vxorps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - vxorps (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - vxorps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2d %k0, %zmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2q %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vpmovm2d %k0, %zmm0 +# CHECK-NEXT: - - 0.50 - - - - 0.50 - - vpmovm2q %k0, %zmm0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovd2m %zmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovq2m %zmm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512dqvl.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512dqvl.s 
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512dqvl.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512dqvl.s @@ -457,10 +457,10 @@ # CHECK-NEXT: 1 1 0.33 vxorps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vxorps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vxorps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 1 0.25 vpmovm2d %k0, %xmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2q %k0, %xmm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2d %k0, %ymm0 -# CHECK-NEXT: 1 1 0.25 vpmovm2q %k0, %ymm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2d %k0, %xmm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2q %k0, %xmm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2d %k0, %ymm0 +# CHECK-NEXT: 1 1 0.33 vpmovm2q %k0, %ymm0 # CHECK-NEXT: 1 1 1.00 vpmovd2m %xmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovq2m %xmm0, %k0 # CHECK-NEXT: 1 1 1.00 vpmovd2m %ymm0, %k0 @@ -480,7 +480,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - - 77.00 73.00 66.50 66.50 - 85.00 1.00 - +# CHECK-NEXT: - - 77.33 73.33 66.50 66.50 - 85.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -688,10 +688,10 @@ # CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vxorps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vxorps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vxorps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2d %k0, %xmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2q %k0, %xmm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2d %k0, %ymm0 -# CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - vpmovm2q %k0, %ymm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vpmovm2d %k0, %xmm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vpmovm2q %k0, %xmm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vpmovm2d %k0, %ymm0 +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vpmovm2q %k0, %ymm0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovd2m %xmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovq2m %xmm0, %k0 # CHECK-NEXT: - - 1.00 - - - - - - - vpmovd2m %ymm0, %k0 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s @@ -308,6 +308,42 @@ vminps (%rax), %ymm17, %ymm19 {z}{k1} vminps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vmovapd %xmm16, %xmm19 +vmovapd (%rax), %xmm19 +vmovapd %xmm16, (%rax) +vmovapd %xmm16, %xmm19 {k1} +vmovapd (%rax), %xmm19 {k1} +vmovapd %xmm16, (%rax) {k1} +vmovapd %xmm16, %xmm19 {z}{k1} +vmovapd (%rax), %xmm19 {z}{k1} + +vmovapd %ymm16, %ymm19 +vmovapd (%rax), %ymm19 +vmovapd %ymm16, (%rax) +vmovapd %ymm16, %ymm19 {k1} +vmovapd (%rax), %ymm19 {k1} +vmovapd %ymm16, (%rax) {k1} +vmovapd %ymm16, %ymm19 {z}{k1} +vmovapd (%rax), %ymm19 {z}{k1} + +vmovaps %xmm16, %xmm19 +vmovaps (%rax), %xmm19 +vmovaps %xmm16, (%rax) +vmovaps %xmm16, %xmm19 {k1} +vmovaps (%rax), %xmm19 {k1} +vmovaps %xmm16, (%rax) {k1} +vmovaps %xmm16, %xmm19 {z}{k1} +vmovaps (%rax), %xmm19 {z}{k1} + +vmovaps %ymm16, %ymm19 +vmovaps (%rax), %ymm19 +vmovaps %ymm16, (%rax) +vmovaps %ymm16, %ymm19 {k1} +vmovaps (%rax), %ymm19 {k1} +vmovaps %ymm16, (%rax) {k1} +vmovaps %ymm16, %ymm19 {z}{k1} +vmovaps (%rax), %ymm19 {z}{k1} + vmovddup %xmm16, %xmm19 vmovddup (%rax), %xmm19 vmovddup %xmm16, %xmm19 {k1} @@ -315,6 +351,78 @@ 
vmovddup %xmm16, %xmm19 {z}{k1} vmovddup (%rax), %xmm19 {z}{k1} +vmovdqa32 %xmm16, %xmm19 +vmovdqa32 (%rax), %xmm19 +vmovdqa32 %xmm16, (%rax) +vmovdqa32 %xmm16, %xmm19 {k1} +vmovdqa32 (%rax), %xmm19 {k1} +vmovdqa32 %xmm16, (%rax) {k1} +vmovdqa32 %xmm16, %xmm19 {z}{k1} +vmovdqa32 (%rax), %xmm19 {z}{k1} + +vmovdqa32 %ymm16, %ymm19 +vmovdqa32 (%rax), %ymm19 +vmovdqa32 %ymm16, (%rax) +vmovdqa32 %ymm16, %ymm19 {k1} +vmovdqa32 (%rax), %ymm19 {k1} +vmovdqa32 %ymm16, (%rax) {k1} +vmovdqa32 %ymm16, %ymm19 {z}{k1} +vmovdqa32 (%rax), %ymm19 {z}{k1} + +vmovdqa64 %xmm16, %xmm19 +vmovdqa64 (%rax), %xmm19 +vmovdqa64 %xmm16, (%rax) +vmovdqa64 %xmm16, %xmm19 {k1} +vmovdqa64 (%rax), %xmm19 {k1} +vmovdqa64 %xmm16, (%rax) {k1} +vmovdqa64 %xmm16, %xmm19 {z}{k1} +vmovdqa64 (%rax), %xmm19 {z}{k1} + +vmovdqa64 %ymm16, %ymm19 +vmovdqa64 (%rax), %ymm19 +vmovdqa64 %ymm16, (%rax) +vmovdqa64 %ymm16, %ymm19 {k1} +vmovdqa64 (%rax), %ymm19 {k1} +vmovdqa64 %ymm16, (%rax) {k1} +vmovdqa64 %ymm16, %ymm19 {z}{k1} +vmovdqa64 (%rax), %ymm19 {z}{k1} + +vmovdqu32 %xmm16, %xmm19 +vmovdqu32 (%rax), %xmm19 +vmovdqu32 %xmm16, (%rax) +vmovdqu32 %xmm16, %xmm19 {k1} +vmovdqu32 (%rax), %xmm19 {k1} +vmovdqu32 %xmm16, (%rax) {k1} +vmovdqu32 %xmm16, %xmm19 {z}{k1} +vmovdqu32 (%rax), %xmm19 {z}{k1} + +vmovdqu32 %ymm16, %ymm19 +vmovdqu32 (%rax), %ymm19 +vmovdqu32 %ymm16, (%rax) +vmovdqu32 %ymm16, %ymm19 {k1} +vmovdqu32 (%rax), %ymm19 {k1} +vmovdqu32 %ymm16, (%rax) {k1} +vmovdqu32 %ymm16, %ymm19 {z}{k1} +vmovdqu32 (%rax), %ymm19 {z}{k1} + +vmovdqu64 %xmm16, %xmm19 +vmovdqu64 (%rax), %xmm19 +vmovdqu64 %xmm16, (%rax) +vmovdqu64 %xmm16, %xmm19 {k1} +vmovdqu64 (%rax), %xmm19 {k1} +vmovdqu64 %xmm16, (%rax) {k1} +vmovdqu64 %xmm16, %xmm19 {z}{k1} +vmovdqu64 (%rax), %xmm19 {z}{k1} + +vmovdqu64 %ymm16, %ymm19 +vmovdqu64 (%rax), %ymm19 +vmovdqu64 %ymm16, (%rax) +vmovdqu64 %ymm16, %ymm19 {k1} +vmovdqu64 (%rax), %ymm19 {k1} +vmovdqu64 %ymm16, (%rax) {k1} +vmovdqu64 %ymm16, %ymm19 {z}{k1} +vmovdqu64 (%rax), %ymm19 {z}{k1} + vmovddup %ymm16, %ymm19 vmovddup (%rax), %ymm19 vmovddup %ymm16, %ymm19 {k1} @@ -350,6 +458,42 @@ vmovsldup %ymm16, %ymm19 {z}{k1} vmovsldup (%rax), %ymm19 {z}{k1} +vmovupd %xmm16, %xmm19 +vmovupd (%rax), %xmm19 +vmovupd %xmm16, (%rax) +vmovupd %xmm16, %xmm19 {k1} +vmovupd (%rax), %xmm19 {k1} +vmovupd %xmm16, (%rax) {k1} +vmovupd %xmm16, %xmm19 {z}{k1} +vmovupd (%rax), %xmm19 {z}{k1} + +vmovupd %ymm16, %ymm19 +vmovupd (%rax), %ymm19 +vmovupd %ymm16, (%rax) +vmovupd %ymm16, %ymm19 {k1} +vmovupd (%rax), %ymm19 {k1} +vmovupd %ymm16, (%rax) {k1} +vmovupd %ymm16, %ymm19 {z}{k1} +vmovupd (%rax), %ymm19 {z}{k1} + +vmovups %xmm16, %xmm19 +vmovups (%rax), %xmm19 +vmovups %xmm16, (%rax) +vmovups %xmm16, %xmm19 {k1} +vmovups (%rax), %xmm19 {k1} +vmovups %xmm16, (%rax) {k1} +vmovups %xmm16, %xmm19 {z}{k1} +vmovups (%rax), %xmm19 {z}{k1} + +vmovups %ymm16, %ymm19 +vmovups (%rax), %ymm19 +vmovups %ymm16, (%rax) +vmovups %ymm16, %ymm19 {k1} +vmovups (%rax), %ymm19 {k1} +vmovups %ymm16, (%rax) {k1} +vmovups %ymm16, %ymm19 {z}{k1} +vmovups (%rax), %ymm19 {z}{k1} + vmulpd %xmm16, %xmm17, %xmm19 vmulpd (%rax), %xmm17, %xmm19 vmulpd (%rax){1to2}, %xmm17, %xmm19 @@ -1517,12 +1661,108 @@ # CHECK-NEXT: 1 4 0.50 vminps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 11 0.50 * vminps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 11 0.50 * vminps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovapd %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovapd (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovapd %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovapd 
%xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovapd (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovapd %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovapd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovapd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovapd %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovapd %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovapd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovapd %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovapd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovapd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovaps %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovaps (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovaps %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovaps %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovaps (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovaps %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovaps %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovaps (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovaps %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovaps %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovaps %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovaps %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovaps %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovaps (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovddup %xmm16, %xmm19 # CHECK-NEXT: 2 7 0.50 * vmovddup (%rax), %xmm19 # CHECK-NEXT: 1 1 1.00 vmovddup %xmm16, %xmm19 {%k1} # CHECK-NEXT: 2 7 0.50 * vmovddup (%rax), %xmm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovddup %xmm16, %xmm19 {%k1} {z} # CHECK-NEXT: 2 7 0.50 * vmovddup (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqa32 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqa32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqa32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqa32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqa64 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqa64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqa64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqa64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqa64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqa64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 
8 0.50 * vmovdqa64 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqu32 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqu32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqu32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovdqu64 (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovdqu64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovdqu64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovdqu64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovdqu64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovdqu64 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 1 1.00 vmovddup %ymm16, %ymm19 # CHECK-NEXT: 2 8 0.50 * vmovddup (%rax), %ymm19 # CHECK-NEXT: 1 1 1.00 vmovddup %ymm16, %ymm19 {%k1} @@ -1553,6 +1793,38 @@ # CHECK-NEXT: 2 8 0.50 * vmovsldup (%rax), %ymm19 {%k1} # CHECK-NEXT: 1 1 1.00 vmovsldup %ymm16, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vmovsldup (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovupd %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovupd (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovupd %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovupd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovupd (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovupd %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovupd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovupd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovupd %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovupd %ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovupd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovupd %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovupd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovupd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovups %xmm16, %xmm19 +# CHECK-NEXT: 2 7 0.50 * vmovups (%rax), %xmm19 +# CHECK-NEXT: 2 1 1.00 * vmovups %xmm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovups %xmm16, %xmm19 {%k1} +# CHECK-NEXT: 2 7 0.50 * vmovups (%rax), %xmm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovups %xmm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovups %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: 2 7 0.50 * vmovups (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: 1 1 0.33 vmovups %ymm16, %ymm19 +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %ymm19 +# CHECK-NEXT: 2 1 1.00 * vmovups 
%ymm16, (%rax) +# CHECK-NEXT: 1 1 0.33 vmovups %ymm16, %ymm19 {%k1} +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %ymm19 {%k1} +# CHECK-NEXT: 2 1 1.00 * vmovups %ymm16, (%rax) {%k1} +# CHECK-NEXT: 1 1 0.33 vmovups %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: 1 4 0.50 vmulpd %xmm16, %xmm17, %xmm19 # CHECK-NEXT: 2 10 0.50 * vmulpd (%rax), %xmm17, %xmm19 # CHECK-NEXT: 2 10 0.50 * vmulpd (%rax){1to2}, %xmm17, %xmm19 @@ -2350,7 +2622,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 423.00 257.33 169.33 372.50 372.50 - 715.33 4.00 - +# CHECK-NEXT: - 423.00 289.33 201.33 407.17 407.17 32.00 747.33 4.00 10.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -2626,12 +2898,108 @@ # CHECK-NEXT: - - 0.50 0.50 - - - - - - vminps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - vminps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - vminps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovapd %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovapd (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovapd %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovapd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovapd (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovapd %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovapd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovapd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovapd %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovapd (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovapd %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovapd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovapd (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovapd %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovapd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovapd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovaps %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovaps (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovaps %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovaps %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovaps (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovaps %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovaps %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovaps (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovaps %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovaps (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovaps %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovaps %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovaps (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovaps %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovaps %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovaps (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - - - 1.00 - - 
vmovddup %xmm16, %xmm19 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovddup (%rax), %xmm19 # CHECK-NEXT: - - - - - - - 1.00 - - vmovddup %xmm16, %xmm19 {%k1} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovddup (%rax), %xmm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vmovddup %xmm16, %xmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovddup (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa32 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa32 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa32 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa32 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa32 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa32 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa64 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa64 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa64 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa64 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa64 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa64 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqa64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqa64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqa64 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu32 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu32 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu32 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu32 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu32 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu32 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu32 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu32 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu32 %ymm16, %ymm19 +# CHECK-NEXT: - - 
0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu32 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu32 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu32 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu32 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu32 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu32 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu32 (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu64 %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu64 (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu64 %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu64 %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu64 (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu64 %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu64 %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu64 (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu64 %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu64 (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu64 %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu64 %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu64 (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovdqu64 %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovdqu64 %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovdqu64 (%rax), %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - - - 1.00 - - vmovddup %ymm16, %ymm19 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovddup (%rax), %ymm19 # CHECK-NEXT: - - - - - - - 1.00 - - vmovddup %ymm16, %ymm19 {%k1} @@ -2662,6 +3030,38 @@ # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovsldup (%rax), %ymm19 {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vmovsldup %ymm16, %ymm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovsldup (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovupd %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovupd (%rax), %xmm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovupd %xmm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovupd %xmm16, %xmm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovupd (%rax), %xmm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovupd %xmm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovupd %xmm16, %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovupd (%rax), %xmm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovupd %ymm16, %ymm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovupd (%rax), %ymm19 +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovupd %ymm16, (%rax) +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovupd %ymm16, %ymm19 {%k1} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovupd (%rax), %ymm19 {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovupd %ymm16, (%rax) {%k1} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovupd %ymm16, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovupd (%rax), %ymm19 {%k1} {z} +# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovups %xmm16, %xmm19 +# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovups (%rax), %xmm19 +# 
CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovups %xmm16, (%rax)
+# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovups %xmm16, %xmm19 {%k1}
+# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovups (%rax), %xmm19 {%k1}
+# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovups %xmm16, (%rax) {%k1}
+# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovups %xmm16, %xmm19 {%k1} {z}
+# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovups (%rax), %xmm19 {%k1} {z}
+# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovups %ymm16, %ymm19
+# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovups (%rax), %ymm19
+# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovups %ymm16, (%rax)
+# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovups %ymm16, %ymm19 {%k1}
+# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovups (%rax), %ymm19 {%k1}
+# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vmovups %ymm16, (%rax) {%k1}
+# CHECK-NEXT: - - 0.33 0.33 - - - 0.33 - - vmovups %ymm16, %ymm19 {%k1} {z}
+# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vmovups (%rax), %ymm19 {%k1} {z}
# CHECK-NEXT: - - 0.50 0.50 - - - - - - vmulpd %xmm16, %xmm17, %xmm19
# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - vmulpd (%rax), %xmm17, %xmm19
# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - vmulpd (%rax){1to2}, %xmm17, %xmm19
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -476,7 +476,7 @@
      return;
    }
-    char *AllocBase = 0;
+    char *AllocBase = nullptr;
    {
      std::lock_guard<std::mutex> Lock(SlabMutex);
@@ -1883,7 +1883,7 @@
  if (ShowInitialExecutionSessionState)
    S->ES.dump(outs());
-  JITEvaluatedSymbol EntryPoint = 0;
+  JITEvaluatedSymbol EntryPoint = nullptr;
  {
    TimeRegion TR(Timers ? &Timers->LinkTimer : nullptr);
    // Find the entry-point function unconditionally, since we want to force
diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp
--- a/llvm/tools/llvm-objdump/MachODump.cpp
+++ b/llvm/tools/llvm-objdump/MachODump.cpp
@@ -917,10 +917,10 @@
  else {
    SymbolRef Symbol = *O->getSymbolByIndex(r_symbolnum);
    Expected<StringRef> SymNameNext = Symbol.getName();
-    const char *name = NULL;
+    const char *name = nullptr;
    if (SymNameNext)
      name = SymNameNext->data();
-    if (name == NULL)
+    if (name == nullptr)
      outs() << format("?(%d)\n", r_symbolnum);
    else
      outs() << name << "\n";
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -968,7 +968,7 @@
      "gen-cs-nested-profile", cl::Hidden, cl::init(false),
      cl::desc("Generate nested function profiles for CSSPGO"));
  cl::opt<std::string> DebugInfoFilename(
-      "debug-info", cl::init(""), cl::Hidden,
+      "debug-info", cl::init(""),
      cl::desc("Use the provided debug info to correlate the raw profile."));
  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n");
diff --git a/llvm/tools/llvm-readobj/COFFDumper.cpp b/llvm/tools/llvm-readobj/COFFDumper.cpp
--- a/llvm/tools/llvm-readobj/COFFDumper.cpp
+++ b/llvm/tools/llvm-readobj/COFFDumper.cpp
@@ -126,7 +126,7 @@
  void printCOFFTLSDirectory(const coff_tls_directory *TlsTable);
  typedef void (*PrintExtraCB)(raw_ostream &, const uint8_t *);
  void printRVATable(uint64_t TableVA, uint64_t Count, uint64_t EntrySize,
-                    PrintExtraCB PrintExtra = 0);
+                    PrintExtraCB PrintExtra = nullptr);
  void printCodeViewSymbolSection(StringRef SectionName, const SectionRef &Section);
  void printCodeViewTypeSection(StringRef SectionName, const SectionRef &Section);
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceMetadata.cpp b/llvm/tools/llvm-reduce/deltas/ReduceMetadata.cpp
--- a/llvm/tools/llvm-reduce/deltas/ReduceMetadata.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceMetadata.cpp
@@ -31,7 +31,7 @@
  for (NamedMDNode *NN : NamedNodesToDelete) {
    for (auto I : seq<unsigned>(0, NN->getNumOperands()))
-      NN->setOperand(I, NULL);
+      NN->setOperand(I, nullptr);
    NN->eraseFromParent();
  }
@@ -41,7 +41,7 @@
    GV.getAllMetadata(MDs);
    for (std::pair<unsigned, MDNode *> &MD : MDs)
      if (!O.shouldKeep())
-        GV.setMetadata(MD.first, NULL);
+        GV.setMetadata(MD.first, nullptr);
  }
  for (Function &F : Program) {
@@ -51,7 +51,7 @@
    F.getAllMetadata(MDs);
    for (std::pair<unsigned, MDNode *> &MD : MDs)
      if (!O.shouldKeep())
-        F.setMetadata(MD.first, NULL);
+        F.setMetadata(MD.first, nullptr);
  }
  // Delete out-of-chunk metadata attached to instructions.
@@ -60,7 +60,7 @@
      I.getAllMetadata(MDs);
      for (std::pair<unsigned, MDNode *> &MD : MDs)
        if (!O.shouldKeep())
-          I.setMetadata(MD.first, NULL);
+          I.setMetadata(MD.first, nullptr);
    }
  }
}
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -2000,11 +2000,11 @@
  AssumptionCache AC(*F);
  Value *Stride = &*F->arg_begin();
-  ConstantRange CR1 = computeConstantRange(Stride, true, &AC, nullptr);
+  ConstantRange CR1 = computeConstantRange(Stride, false, true, &AC, nullptr);
  EXPECT_TRUE(CR1.isFullSet());
  Instruction *I = &findInstructionByName(F, "stride.plus.one");
-  ConstantRange CR2 = computeConstantRange(Stride, true, &AC, I);
+  ConstantRange CR2 = computeConstantRange(Stride, false, true, &AC, I);
  EXPECT_EQ(5, CR2.getLower());
  EXPECT_EQ(10, CR2.getUpper());
}
@@ -2034,7 +2034,7 @@
  AssumptionCache AC(*F);
  Value *Stride = &*F->arg_begin();
  Instruction *I = &findInstructionByName(F, "stride.plus.one");
-  ConstantRange CR = computeConstantRange(Stride, true, &AC, I);
+  ConstantRange CR = computeConstantRange(Stride, false, true, &AC, I);
  EXPECT_EQ(99, *CR.getSingleElement());
}
@@ -2072,12 +2072,12 @@
  AssumptionCache AC(*F);
  Value *Stride = &*F->arg_begin();
  Instruction *GT2 = &findInstructionByName(F, "gt.2");
-  ConstantRange CR = computeConstantRange(Stride, true, &AC, GT2);
+  ConstantRange CR = computeConstantRange(Stride, false, true, &AC, GT2);
  EXPECT_EQ(5, CR.getLower());
  EXPECT_EQ(0, CR.getUpper());
  Instruction *I = &findInstructionByName(F, "stride.plus.one");
-  ConstantRange CR2 = computeConstantRange(Stride, true, &AC, I);
+  ConstantRange CR2 = computeConstantRange(Stride, false, true, &AC, I);
  EXPECT_EQ(50, CR2.getLower());
  EXPECT_EQ(100, CR2.getUpper());
}
@@ -2105,7 +2105,7 @@
  Value *Stride = &*F->arg_begin();
  Instruction *I = &findInstructionByName(F, "stride.plus.one");
-  ConstantRange CR = computeConstantRange(Stride, true, &AC, I);
+  ConstantRange CR = computeConstantRange(Stride, false, true, &AC, I);
  EXPECT_TRUE(CR.isEmptySet());
}
@@ -2133,8 +2133,8 @@
  Value *X2 = &*std::next(F->arg_begin());
  Instruction *I = &findInstructionByName(F, "stride.plus.one");
-  ConstantRange CR1 = computeConstantRange(X1, true, &AC, I);
-  ConstantRange CR2 = computeConstantRange(X2, true, &AC, I);
+  ConstantRange CR1 = computeConstantRange(X1, false, true, &AC, I);
+  ConstantRange CR2 = computeConstantRange(X2, false, true, &AC, I);
  EXPECT_EQ(5, CR1.getLower());
  EXPECT_EQ(0, CR1.getUpper());
@@ -2144,7 +2144,7 @@
  // Check the depth cutoff results in a conservative result (full set) by
  // passing Depth == MaxDepth == 6.
-  ConstantRange CR3 = computeConstantRange(X2, true, &AC, I, nullptr, 6);
+  ConstantRange CR3 = computeConstantRange(X2, false, true, &AC, I, nullptr, 6);
  EXPECT_TRUE(CR3.isFullSet());
}
{
@@ -2165,7 +2165,7 @@
  Value *X2 = &*std::next(F->arg_begin());
  Instruction *I = &findInstructionByName(F, "stride.plus.one");
-  ConstantRange CR1 = computeConstantRange(X2, true, &AC, I);
+  ConstantRange CR1 = computeConstantRange(X2, false, true, &AC, I);
  // If we don't know the value of x.2, we don't know the value of x.1.
  EXPECT_TRUE(CR1.isFullSet());
}
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -303,8 +303,7 @@
  EXPECT_EQ(Barrier->getNumUses(), 0U);
  EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(),
            1U);
-  EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0),
-            CBB);
+  EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), CBB);
  EXPECT_EQ(cast<CallInst>(Cancel)->getArgOperand(1), GTID);
@@ -342,7 +341,6 @@
  BB = BB->getTerminator()->getSuccessor(0);
  EXPECT_EQ(BB->size(), 4U);
-
  CallInst *GTID = dyn_cast<CallInst>(&BB->front());
  EXPECT_NE(GTID, nullptr);
  EXPECT_EQ(GTID->arg_size(), 1U);
@@ -360,7 +358,8 @@
  Instruction *CancelBBTI = Cancel->getParent()->getTerminator();
  EXPECT_EQ(CancelBBTI->getNumSuccessors(), 2U);
  EXPECT_EQ(CancelBBTI->getSuccessor(0)->size(), 1U);
-  EXPECT_EQ(CancelBBTI->getSuccessor(0)->getUniqueSuccessor(), NewIP.getBlock());
+  EXPECT_EQ(CancelBBTI->getSuccessor(0)->getUniqueSuccessor(),
+            NewIP.getBlock());
  EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 3U);
  CallInst *GTID1 = dyn_cast<CallInst>(&CancelBBTI->getSuccessor(1)->front());
  EXPECT_NE(GTID1, nullptr);
@@ -377,8 +376,7 @@
  EXPECT_EQ(Barrier->getNumUses(), 0U);
  EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(),
            1U);
-  EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0),
-            CBB);
+  EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), CBB);
  EXPECT_EQ(cast<CallInst>(Cancel)->getArgOperand(1), GTID);
@@ -502,8 +500,8 @@
  Builder.CreateStore(F->arg_begin(), PrivAI);
  Builder.restoreIP(CodeGenIP);
-  Value *PrivLoad = Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI,
-                                       "local.use");
+  Value *PrivLoad =
+      Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use");
  Value *Cmp = Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(Cmp, CodeGenIP.getBlock()->getTerminator(),
@@ -809,8 +807,8 @@
  Builder.CreateStore(F->arg_begin(), PrivAI);
  Builder.restoreIP(CodeGenIP);
-  Value *PrivLoad = Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI,
-                                       "local.use");
+  Value *PrivLoad =
+      Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use");
  Value *Cmp = Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(Cmp, CodeGenIP.getBlock()->getTerminator(),
@@ -2011,8 +2009,8 @@
  EntryBB = ThenBB->getUniquePredecessor();
  // simple instructions for body
-  Value *PrivLoad = Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI,
-                                       "local.use");
+  Value *PrivLoad =
+      Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use");
  Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
};
@@ -2164,8 +2162,8 @@
  // body begin
  Builder.restoreIP(CodeGenIP);
  Builder.CreateStore(F->arg_begin(), PrivAI);
-  Value *PrivLoad =
Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, - "local.use"); + Value *PrivLoad = + Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use"); Builder.CreateICmpNE(F->arg_begin(), PrivLoad); }; @@ -2538,32 +2536,33 @@ OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); - IntegerType* Int32 = Type::getInt32Ty(M->getContext()); - AllocaInst* MasterAddress = Builder.CreateAlloca(Int32->getPointerTo()); - AllocaInst* PrivAddress = Builder.CreateAlloca(Int32->getPointerTo()); + IntegerType *Int32 = Type::getInt32Ty(M->getContext()); + AllocaInst *MasterAddress = Builder.CreateAlloca(Int32->getPointerTo()); + AllocaInst *PrivAddress = Builder.CreateAlloca(Int32->getPointerTo()); BasicBlock *EntryBB = BB; OMPBuilder.createCopyinClauseBlocks(Builder.saveIP(), MasterAddress, PrivAddress, Int32, /*BranchtoEnd*/ true); - BranchInst* EntryBr = dyn_cast_or_null(EntryBB->getTerminator()); + BranchInst *EntryBr = dyn_cast_or_null(EntryBB->getTerminator()); EXPECT_NE(EntryBr, nullptr); EXPECT_TRUE(EntryBr->isConditional()); - BasicBlock* NotMasterBB = EntryBr->getSuccessor(0); - BasicBlock* CopyinEnd = EntryBr->getSuccessor(1); - CmpInst* CMP = dyn_cast_or_null(EntryBr->getCondition()); + BasicBlock *NotMasterBB = EntryBr->getSuccessor(0); + BasicBlock *CopyinEnd = EntryBr->getSuccessor(1); + CmpInst *CMP = dyn_cast_or_null(EntryBr->getCondition()); EXPECT_NE(CMP, nullptr); EXPECT_NE(NotMasterBB, nullptr); EXPECT_NE(CopyinEnd, nullptr); - BranchInst* NotMasterBr = dyn_cast_or_null(NotMasterBB->getTerminator()); + BranchInst *NotMasterBr = + dyn_cast_or_null(NotMasterBB->getTerminator()); EXPECT_NE(NotMasterBr, nullptr); EXPECT_FALSE(NotMasterBr->isConditional()); - EXPECT_EQ(CopyinEnd,NotMasterBr->getSuccessor(0)); + EXPECT_EQ(CopyinEnd, NotMasterBr->getSuccessor(0)); } TEST_F(OpenMPIRBuilderTest, SingleDirective) { @@ -2602,8 +2601,8 @@ EntryBB = ThenBB->getUniquePredecessor(); // simple instructions for body - Value *PrivLoad = Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, - "local.use"); + Value *PrivLoad = + Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use"); Builder.CreateICmpNE(F->arg_begin(), PrivLoad); }; @@ -3106,8 +3105,9 @@ IRBuilderBase::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); - Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc); - Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr); + uint32_t StrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, StrSize); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, StrSize); Value *TID = OMPBuilder.getOrCreateThreadID(Ident); Value *SumLocal = Builder.CreateUIToFP(TID, Builder.getFloatTy(), "sum.local"); @@ -3339,8 +3339,9 @@ IRBuilderBase::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); - Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc); - Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr); + uint32_t StrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, StrSize); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, StrSize); Value *TID = OMPBuilder.getOrCreateThreadID(Ident); Value *SumLocal = Builder.CreateUIToFP(TID, Builder.getFloatTy(), "sum.local"); @@ -3359,8 +3360,9 @@ IRBuilderBase::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); - Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc); - Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr); + uint32_t StrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, StrSize); + Value *Ident = 
OMPBuilder.getOrCreateIdent(SrcLocStr, StrSize); Value *TID = OMPBuilder.getOrCreateThreadID(Ident); Value *XorPartial = Builder.CreateLoad(XorType, XorReduced, "xor.partial"); Value *Xor = Builder.CreateXor(XorPartial, TID, "xor"); @@ -3695,8 +3697,11 @@ IRBuilder<> Builder(BB); - Constant *Cst1 = OMPBuilder.getOrCreateSrcLocStr("array1", "file1", 2, 5); - Constant *Cst2 = OMPBuilder.getOrCreateSrcLocStr("array2", "file1", 3, 5); + uint32_t StrSize; + Constant *Cst1 = + OMPBuilder.getOrCreateSrcLocStr("array1", "file1", 2, 5, StrSize); + Constant *Cst2 = + OMPBuilder.getOrCreateSrcLocStr("array2", "file1", 3, 5, StrSize); SmallVector Names = {Cst1, Cst2}; GlobalVariable *OffloadMaptypesGlobal = @@ -3798,11 +3803,15 @@ SmallVector Flags = {0, 2}; - Constant *SrcLocCst = OMPBuilder.getOrCreateSrcLocStr("", "file1", 2, 5); - Value *SrcLocInfo = OMPBuilder.getOrCreateIdent(SrcLocCst); + uint32_t StrSize; + Constant *SrcLocCst = + OMPBuilder.getOrCreateSrcLocStr("", "file1", 2, 5, StrSize); + Value *SrcLocInfo = OMPBuilder.getOrCreateIdent(SrcLocCst, StrSize); - Constant *Cst1 = OMPBuilder.getOrCreateSrcLocStr("array1", "file1", 2, 5); - Constant *Cst2 = OMPBuilder.getOrCreateSrcLocStr("array2", "file1", 3, 5); + Constant *Cst1 = + OMPBuilder.getOrCreateSrcLocStr("array1", "file1", 2, 5, StrSize); + Constant *Cst2 = + OMPBuilder.getOrCreateSrcLocStr("array2", "file1", 3, 5, StrSize); SmallVector Names = {Cst1, Cst2}; GlobalVariable *Maptypes = diff --git a/llvm/utils/TableGen/GlobalISel/GIMatchDagInstr.h b/llvm/utils/TableGen/GlobalISel/GIMatchDagInstr.h --- a/llvm/utils/TableGen/GlobalISel/GIMatchDagInstr.h +++ b/llvm/utils/TableGen/GlobalISel/GIMatchDagInstr.h @@ -61,7 +61,7 @@ /// For debugging purposes, it's helpful to have access to a description of /// the Opcode. However, this object shouldn't use it for more than debugging /// output since predicates are expected to be handled outside the DAG. - CodeGenInstruction *OpcodeAnnotation = 0; + CodeGenInstruction *OpcodeAnnotation = nullptr; /// When true, this instruction will be a starting point for a match attempt. bool IsMatchRoot = false; diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn --- a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn @@ -5,5 +5,8 @@ "//clang/lib/AST", "//clang/lib/Analysis", ] - sources = [ "TypeErasedDataflowAnalysis.cpp" ] + sources = [ + "ControlFlowContext.cpp", + "TypeErasedDataflowAnalysis.cpp", + ] } diff --git a/mlir/include/mlir/Analysis/AffineStructures.h b/mlir/include/mlir/Analysis/AffineStructures.h --- a/mlir/include/mlir/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Analysis/AffineStructures.h @@ -335,15 +335,7 @@ /// match. void mergeLocalIds(FlatAffineConstraints &other); - void print(raw_ostream &os) const; - void dump() const; - protected: - /// Returns false if the fields corresponding to various identifier counts, or - /// equality/inequality buffer sizes aren't consistent; true otherwise. This - /// is meant to be used within an assert internally. - virtual bool hasConsistentState() const; - /// Checks all rows of equality/inequality constraints for trivial /// contradictions (for example: 1 == 0, 0 >= 1), which may have surfaced /// after elimination. Returns true if an invalid constraint is found; @@ -419,6 +411,11 @@ /// equalities. 
bool isColZero(unsigned pos) const; + /// Prints the number of constraints, dimensions, symbols and locals in the + /// FlatAffineConstraints. Also, prints for each identifier whether there is + /// an SSA Value attached to it. + void printSpace(raw_ostream &os) const override; + /// A parameter that controls detection of an unrealistic number of /// constraints. If the number of constraints is this many times the number of /// variables, we consider such a system out of line with the intended use diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h --- a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h @@ -195,7 +195,19 @@ SmallVectorImpl *eqIndices = nullptr, unsigned offset = 0, unsigned num = 0) const; + void print(raw_ostream &os) const; + void dump() const; + protected: + /// Returns false if the fields corresponding to various identifier counts, or + /// equality/inequality buffer sizes aren't consistent; true otherwise. This + /// is meant to be used within an assert internally. + virtual bool hasConsistentState() const; + + /// Prints the number of constraints, dimensions, symbols and locals in the + /// IntegerPolyhedron. + virtual void printSpace(raw_ostream &os) const; + /// Return the index at which the specified kind of id starts. unsigned getIdKindOffset(IdKind kind) const; diff --git a/mlir/include/mlir/Analysis/Presburger/Simplex.h b/mlir/include/mlir/Analysis/Presburger/Simplex.h --- a/mlir/include/mlir/Analysis/Presburger/Simplex.h +++ b/mlir/include/mlir/Analysis/Presburger/Simplex.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Functionality to perform analysis on FlatAffineConstraints. In particular, +// Functionality to perform analysis on an IntegerPolyhedron. In particular, // support for performing emptiness checks and redundancy checks. // //===----------------------------------------------------------------------===// @@ -14,8 +14,8 @@ #ifndef MLIR_ANALYSIS_PRESBURGER_SIMPLEX_H #define MLIR_ANALYSIS_PRESBURGER_SIMPLEX_H -#include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/Presburger/Fraction.h" +#include "mlir/Analysis/Presburger/IntegerPolyhedron.h" #include "mlir/Analysis/Presburger/Matrix.h" #include "mlir/IR/Location.h" #include "mlir/Support/LogicalResult.h" @@ -39,7 +39,7 @@ /// sets. Furthermore, it can find a subset of these constraints that are /// redundant, i.e. a subset of constraints that doesn't constrain the affine /// set further after adding the non-redundant constraints. Simplex can also be -/// constructed from a FlatAffineConstraints object. +/// constructed from an IntegerPolyhedron object. /// /// The implementation of the Simplex and SimplexBase classes, other than the /// functionality for sampling, is based on the paper @@ -146,7 +146,7 @@ SimplexBase() = delete; explicit SimplexBase(unsigned nVar); - explicit SimplexBase(const FlatAffineConstraints &constraints); + explicit SimplexBase(const IntegerPolyhedron &constraints); /// Returns true if the tableau is empty (has conflicting constraints), /// false otherwise. @@ -180,8 +180,8 @@ /// Rollback to a snapshot. This invalidates all later snapshots. void rollback(unsigned snapshot); - /// Add all the constraints from the given FlatAffineConstraints. 
- void intersectFlatAffineConstraints(const FlatAffineConstraints &fac); + /// Add all the constraints from the given IntegerPolyhedron. + void intersectIntegerPolyhedron(const IntegerPolyhedron &fac); /// Returns a rational sample point. This should not be called when Simplex is /// empty. @@ -330,7 +330,7 @@ public: Simplex() = delete; explicit Simplex(unsigned nVar) : SimplexBase(nVar) {} - explicit Simplex(const FlatAffineConstraints &constraints) + explicit Simplex(const IntegerPolyhedron &constraints) : SimplexBase(constraints) {} /// Compute the maximum or minimum value of the given row, depending on @@ -389,7 +389,7 @@ /// Returns true if this Simplex's polytope is a rational subset of `fac`. /// Otherwise, returns false. - bool isRationalSubsetOf(const FlatAffineConstraints &fac); + bool isRationalSubsetOf(const IntegerPolyhedron &fac); private: friend class GBRSimplex; diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -645,6 +645,20 @@ let constructor = "tosa::createTosaToLinalg()"; } +//===----------------------------------------------------------------------===// +// TosaToLinalgNamed +//===----------------------------------------------------------------------===// + +def TosaToLinalgNamed : FunctionPass<"tosa-to-linalg-named"> { + let summary = "Lower TOSA to LinAlg named operations"; + let description = [{ + Pass that converts TOSA operations to the equivalent operations using the + Linalg named operations. + }]; + + let constructor = "tosa::createTosaToLinalgNamed()"; +} + //===----------------------------------------------------------------------===// // TosaToSCF //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h --- a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h +++ b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h @@ -20,6 +20,7 @@ namespace tosa { std::unique_ptr createTosaToLinalg(); +std::unique_ptr createTosaToLinalgNamed(); /// Populates passes to convert from TOSA to Linalg on buffers. At the end of /// the pass, the function will only contain linalg ops or standard ops if the @@ -29,6 +30,9 @@ /// Populates conversion passes from TOSA dialect to Linalg dialect. void populateTosaToLinalgConversionPatterns(RewritePatternSet *patterns); +/// Populates conversion passes from TOSA dialect to Linalg named operations. +void populateTosaToLinalgNamedConversionPatterns(RewritePatternSet *patterns); + } // namespace tosa } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Arithmetic/IR/ArithmeticOps.td b/mlir/include/mlir/Dialect/Arithmetic/IR/ArithmeticOps.td --- a/mlir/include/mlir/Dialect/Arithmetic/IR/ArithmeticOps.td +++ b/mlir/include/mlir/Dialect/Arithmetic/IR/ArithmeticOps.td @@ -892,6 +892,7 @@ rounded using the default rounding mode. When operating on vectors, casts elementwise. }]; + let hasFolder = 1; } //===----------------------------------------------------------------------===// @@ -906,6 +907,7 @@ rounded using the default rounding mode. When operating on vectors, casts elementwise. }]; + let hasFolder = 1; } //===----------------------------------------------------------------------===// @@ -919,6 +921,7 @@ towards zero) unsigned integer value. When operating on vectors, casts elementwise. 
}]; + let hasFolder = 1; } //===----------------------------------------------------------------------===// @@ -932,6 +935,7 @@ towards zero) signed integer value. When operating on vectors, casts elementwise. }]; + let hasFolder = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1395,6 +1395,12 @@ let arguments = (ins LLVM_Type:$a, LLVM_Type:$b); } +class LLVM_BinaryIntrinsicOp traits = []> : + LLVM_OneResultIntrOp { + let arguments = (ins LLVM_Type:$a, LLVM_Type:$b); +} + class LLVM_TernarySameArgsIntrinsicOp traits = []> : LLVM_OneResultIntrOp { @@ -1426,6 +1432,7 @@ def LLVM_SinOp : LLVM_UnaryIntrinsicOp<"sin">; def LLVM_SqrtOp : LLVM_UnaryIntrinsicOp<"sqrt">; def LLVM_PowOp : LLVM_BinarySameArgsIntrinsicOp<"pow">; +def LLVM_PowIOp : LLVM_BinaryIntrinsicOp<"powi">; def LLVM_BitReverseOp : LLVM_UnaryIntrinsicOp<"bitreverse">; def LLVM_CountLeadingZerosOp : LLVM_CountZerosIntrinsicOp<"ctlz">; def LLVM_CountTrailingZerosOp : LLVM_CountZerosIntrinsicOp<"cttz">; diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/StandardOpsBase.td b/mlir/include/mlir/Dialect/StandardOps/IR/StandardOpsBase.td --- a/mlir/include/mlir/Dialect/StandardOps/IR/StandardOpsBase.td +++ b/mlir/include/mlir/Dialect/StandardOps/IR/StandardOpsBase.td @@ -26,13 +26,16 @@ def ATOMIC_RMW_KIND_MINU : I64EnumAttrCase<"minu", 8>; def ATOMIC_RMW_KIND_MULF : I64EnumAttrCase<"mulf", 9>; def ATOMIC_RMW_KIND_MULI : I64EnumAttrCase<"muli", 10>; +def ATOMIC_RMW_KIND_ORI : I64EnumAttrCase<"ori", 11>; +def ATOMIC_RMW_KIND_ANDI : I64EnumAttrCase<"andi", 12>; def AtomicRMWKindAttr : I64EnumAttr< "AtomicRMWKind", "", [ATOMIC_RMW_KIND_ADDF, ATOMIC_RMW_KIND_ADDI, ATOMIC_RMW_KIND_ASSIGN, ATOMIC_RMW_KIND_MAXF, ATOMIC_RMW_KIND_MAXS, ATOMIC_RMW_KIND_MAXU, ATOMIC_RMW_KIND_MINF, ATOMIC_RMW_KIND_MINS, ATOMIC_RMW_KIND_MINU, - ATOMIC_RMW_KIND_MULF, ATOMIC_RMW_KIND_MULI]> { + ATOMIC_RMW_KIND_MULF, ATOMIC_RMW_KIND_MULI, ATOMIC_RMW_KIND_ORI, + ATOMIC_RMW_KIND_ANDI]> { let cppNamespace = "::mlir"; } diff --git a/mlir/include/mlir/Support/DebugAction.h b/mlir/include/mlir/Support/DebugAction.h --- a/mlir/include/mlir/Support/DebugAction.h +++ b/mlir/include/mlir/Support/DebugAction.h @@ -205,8 +205,7 @@ /// Provide classof to allow casting between handler types. 
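/// (A minimal illustrative sketch, not part of this patch: the action and
/// handler names below are hypothetical.) classof is what lets LLVM-style
/// RTTI recover a concrete handler from a stored HandlerBase pointer:
///
///   if (auto *H = llvm::dyn_cast<MyAction::Handler>(GenericHandler))
///     ...; // TypeIDs matched in classof, so the cast is safe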
static bool classof(const DebugActionManager::HandlerBase *handler) { - return handler->getHandlerID() == - TypeID::get::Handler>(); + return handler->getHandlerID() == TypeID::get(); } }; diff --git a/mlir/include/mlir/Tools/PDLL/AST/Types.h b/mlir/include/mlir/Tools/PDLL/AST/Types.h --- a/mlir/include/mlir/Tools/PDLL/AST/Types.h +++ b/mlir/include/mlir/Tools/PDLL/AST/Types.h @@ -51,7 +51,6 @@ }; Type(Storage *impl = nullptr) : impl(impl) {} - Type(const Type &other) = default; bool operator==(const Type &other) const { return impl == other.impl; } bool operator!=(const Type &other) const { return !(*this == other); } diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp --- a/mlir/lib/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Analysis/AffineAnalysis.cpp @@ -57,6 +57,8 @@ .Case([](arith::AddFOp) { return AtomicRMWKind::addf; }) .Case([](arith::MulFOp) { return AtomicRMWKind::mulf; }) .Case([](arith::AddIOp) { return AtomicRMWKind::addi; }) + .Case([](arith::AndIOp) { return AtomicRMWKind::andi; }) + .Case([](arith::OrIOp) { return AtomicRMWKind::ori; }) .Case([](arith::MulIOp) { return AtomicRMWKind::muli; }) .Case([](arith::MinFOp) { return AtomicRMWKind::minf; }) .Case([](arith::MaxFOp) { return AtomicRMWKind::maxf; }) diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp --- a/mlir/lib/Analysis/AffineStructures.cpp +++ b/mlir/lib/Analysis/AffineStructures.cpp @@ -747,19 +747,6 @@ } } -bool FlatAffineConstraints::hasConsistentState() const { - if (!inequalities.hasConsistentState()) - return false; - if (!equalities.hasConsistentState()) - return false; - - // Catches errors where numDims, numSymbols, numIds aren't consistent. - if (numDims > numIds || numSymbols > numIds || numDims + numSymbols > numIds) - return false; - - return true; -} - bool FlatAffineValueConstraints::hasConsistentState() const { return FlatAffineConstraints::hasConsistentState() && values.size() == getNumIds(); @@ -2587,11 +2574,8 @@ return true; } -void FlatAffineConstraints::print(raw_ostream &os) const { - assert(hasConsistentState()); - os << "\nConstraints (" << getNumDimIds() << " dims, " << getNumSymbolIds() - << " symbols, " << getNumLocalIds() << " locals), (" << getNumConstraints() - << " constraints)\n"; +void FlatAffineConstraints::printSpace(raw_ostream &os) const { + IntegerPolyhedron::printSpace(os); os << "("; for (unsigned i = 0, e = getNumIds(); i < e; i++) { if (auto *valueCstr = dyn_cast(this)) { @@ -2604,23 +2588,8 @@ } } os << " const)\n"; - for (unsigned i = 0, e = getNumEqualities(); i < e; ++i) { - for (unsigned j = 0, f = getNumCols(); j < f; ++j) { - os << atEq(i, j) << " "; - } - os << "= 0\n"; - } - for (unsigned i = 0, e = getNumInequalities(); i < e; ++i) { - for (unsigned j = 0, f = getNumCols(); j < f; ++j) { - os << atIneq(i, j) << " "; - } - os << ">= 0\n"; - } - os << '\n'; } -void FlatAffineConstraints::dump() const { print(llvm::errs()); } - /// Removes duplicate constraints, trivially true constraints, and constraints /// that can be detected as redundant as a result of differing only in their /// constant term part. 
A constraint of the form >= 0 is diff --git a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp --- a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp @@ -271,3 +271,42 @@ eqIndices->push_back(r); } } + +bool IntegerPolyhedron::hasConsistentState() const { + if (!inequalities.hasConsistentState()) + return false; + if (!equalities.hasConsistentState()) + return false; + + // Catches errors where numDims, numSymbols, numIds aren't consistent. + if (numDims > numIds || numSymbols > numIds || numDims + numSymbols > numIds) + return false; + + return true; +} + +void IntegerPolyhedron::printSpace(raw_ostream &os) const { + os << "\nConstraints (" << getNumDimIds() << " dims, " << getNumSymbolIds() + << " symbols, " << getNumLocalIds() << " locals), (" << getNumConstraints() + << " constraints)\n"; +} + +void IntegerPolyhedron::print(raw_ostream &os) const { + assert(hasConsistentState()); + printSpace(os); + for (unsigned i = 0, e = getNumEqualities(); i < e; ++i) { + for (unsigned j = 0, f = getNumCols(); j < f; ++j) { + os << atEq(i, j) << " "; + } + os << "= 0\n"; + } + for (unsigned i = 0, e = getNumInequalities(); i < e; ++i) { + for (unsigned j = 0, f = getNumCols(); j < f; ++j) { + os << atIneq(i, j) << " "; + } + os << ">= 0\n"; + } + os << '\n'; +} + +void IntegerPolyhedron::dump() const { print(llvm::errs()); } diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp --- a/mlir/lib/Analysis/Presburger/Simplex.cpp +++ b/mlir/lib/Analysis/Presburger/Simplex.cpp @@ -28,7 +28,7 @@ } } -SimplexBase::SimplexBase(const FlatAffineConstraints &constraints) +SimplexBase::SimplexBase(const IntegerPolyhedron &constraints) : SimplexBase(constraints.getNumIds()) { for (unsigned i = 0, numIneqs = constraints.getNumInequalities(); i < numIneqs; ++i) @@ -502,15 +502,14 @@ undoLog.insert(undoLog.end(), count, UndoLogEntry::RemoveLastVariable); } -/// Add all the constraints from the given FlatAffineConstraints. -void SimplexBase::intersectFlatAffineConstraints( - const FlatAffineConstraints &fac) { - assert(fac.getNumIds() == getNumVariables() && - "FlatAffineConstraints must have same dimensionality as simplex"); - for (unsigned i = 0, e = fac.getNumInequalities(); i < e; ++i) - addInequality(fac.getInequality(i)); - for (unsigned i = 0, e = fac.getNumEqualities(); i < e; ++i) - addEquality(fac.getEquality(i)); +/// Add all the constraints from the given IntegerPolyhedron. 
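/// A minimal usage sketch (`poly` is an illustrative IntegerPolyhedron with
/// the same number of identifiers as the tableau):
///
///   Simplex simplex(poly.getNumIds());
///   simplex.intersectIntegerPolyhedron(poly); // add every ineq/eq row
///   if (simplex.isEmpty())
///     ...; // the combined constraint system is infeasible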
+void SimplexBase::intersectIntegerPolyhedron(const IntegerPolyhedron &poly) { + assert(poly.getNumIds() == getNumVariables() && + "IntegerPolyhedron must have same dimensionality as simplex"); + for (unsigned i = 0, e = poly.getNumInequalities(); i < e; ++i) + addInequality(poly.getInequality(i)); + for (unsigned i = 0, e = poly.getNumEqualities(); i < e; ++i) + addEquality(poly.getEquality(i)); } Optional Simplex::computeRowOptimum(Direction direction, @@ -1285,16 +1284,16 @@ void SimplexBase::dump() const { print(llvm::errs()); } -bool Simplex::isRationalSubsetOf(const FlatAffineConstraints &fac) { +bool Simplex::isRationalSubsetOf(const IntegerPolyhedron &poly) { if (isEmpty()) return true; - for (unsigned i = 0, e = fac.getNumInequalities(); i < e; ++i) - if (!isRedundantInequality(fac.getInequality(i))) + for (unsigned i = 0, e = poly.getNumInequalities(); i < e; ++i) + if (!isRedundantInequality(poly.getInequality(i))) return false; - for (unsigned i = 0, e = fac.getNumEqualities(); i < e; ++i) - if (!isRedundantEquality(fac.getEquality(i))) + for (unsigned i = 0, e = poly.getNumEqualities(); i < e; ++i) + if (!isRedundantEquality(poly.getEquality(i))) return false; return true; diff --git a/mlir/lib/Analysis/PresburgerSet.cpp b/mlir/lib/Analysis/PresburgerSet.cpp --- a/mlir/lib/Analysis/PresburgerSet.cpp +++ b/mlir/lib/Analysis/PresburgerSet.cpp @@ -242,7 +242,7 @@ simplex.appendVariable(numLocalsAdded); unsigned snapshotBeforeIntersect = simplex.getSnapshot(); - simplex.intersectFlatAffineConstraints(sI); + simplex.intersectIntegerPolyhedron(sI); if (simplex.isEmpty()) { /// b ^ s_i is empty, so b \ s_i = b. We move directly to i + 1. diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -792,6 +792,10 @@ return LLVM::AtomicBinOp::min; case AtomicRMWKind::minu: return LLVM::AtomicBinOp::umin; + case AtomicRMWKind::ori: + return LLVM::AtomicBinOp::_or; + case AtomicRMWKind::andi: + return LLVM::AtomicBinOp::_and; default: return llvm::None; } diff --git a/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt b/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt --- a/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt +++ b/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt @@ -1,5 +1,7 @@ add_mlir_conversion_library(MLIRTosaToLinalg TosaToLinalg.cpp + TosaToLinalgNamed.cpp + TosaToLinalgNamedPass.cpp TosaToLinalgPass.cpp ADDITIONAL_HEADER_DIRS @@ -18,6 +20,7 @@ MLIRMath MLIRPass MLIRTensor + MLIRTransforms MLIRTosa MLIRTosaTransforms MLIRSupport diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -61,37 +61,6 @@ return rewriter.create(loc, largerThanMax, max, minOrArg); } -static mlir::Value applyPad(Location loc, Value input, ArrayRef pad, - Attribute padAttr, OpBuilder &rewriter) { - // Input should be padded if necessary. 
- if (llvm::all_of(pad, [](int64_t p) { return p == 0; })) - return input; - - ShapedType inputTy = input.getType().cast(); - Type inputETy = inputTy.getElementType(); - auto inputShape = inputTy.getShape(); - - assert((inputShape.size() * 2) == pad.size()); - - SmallVector paddedShape; - SmallVector lowIndices; - SmallVector highIndices; - for (int i = 0, s = inputShape.size(); i < s; i++) { - auto lowPad = pad[i * 2]; - auto highPad = pad[i * 2 + 1]; - paddedShape.push_back(inputShape[i] + highPad + lowPad); - lowIndices.push_back(rewriter.getIndexAttr(lowPad)); - highIndices.push_back(rewriter.getIndexAttr(highPad)); - } - - Value padValue = rewriter.create(loc, padAttr); - - return linalg::PadTensorOp::createPadScalarOp( - RankedTensorType::get(paddedShape, inputETy), input, padValue, - lowIndices, highIndices, /*nofold=*/false, loc, rewriter) - .result(); -} - static SmallVector filterDynamicDims(SmallVector dynDims) { SmallVector filteredDims; for (auto dim : dynDims) @@ -1065,510 +1034,6 @@ } }; -class ConvConverter : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(tosa::Conv2DOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Location loc = op->getLoc(); - Value input = op->getOperand(0); - Value weight = op->getOperand(1); - Value bias = op->getOperand(2); - - ShapedType inputTy = input.getType().cast(); - ShapedType weightTy = weight.getType().cast(); - ShapedType biasTy = bias.getType().cast(); - ShapedType resultTy = op->getResult(0).getType().cast(); - - Type inputETy = inputTy.getElementType(); - Type resultETy = resultTy.getElementType(); - - auto padAttr = op->getAttr("pad").cast(); - auto strideTosaAttr = op->getAttr("stride").cast(); - auto dilationTosaAttr = op->getAttr("dilation").cast(); - bool isQuantized = op->hasAttr("quantization_info"); - - if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || - !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) - return rewriter.notifyMatchFailure(op, - "tosa.conv ops require static shapes"); - - if (inputETy.isUnsignedInteger()) - return rewriter.notifyMatchFailure( - op, "tosa.conv ops does not support unsigned integer input"); - - auto weightShape = weightTy.getShape(); - - // Apply padding as necessary. - Attribute zeroAttr = rewriter.getZeroAttr(inputETy); - if (isQuantized) { - auto quantizationInfo = - op->getAttr("quantization_info").cast(); - auto iZp = quantizationInfo.input_zp().getValue().getSExtValue(); - - int64_t intMin = - APInt::getSignedMinValue(inputETy.getIntOrFloatBitWidth()) - .getSExtValue(); - int64_t intMax = - APInt::getSignedMaxValue(inputETy.getIntOrFloatBitWidth()) - .getSExtValue(); - - if (iZp < intMin || iZp > intMax) - return rewriter.notifyMatchFailure( - op, "tosa.conv op quantization has zp outside of input range"); - - zeroAttr = rewriter.getIntegerAttr(inputETy, iZp); - } - - llvm::SmallVector pad; - pad.resize(2, 0); - getValuesFromIntArrayAttribute(padAttr, pad); - pad.resize(pad.size() + 2, 0); - input = applyPad(loc, input, pad, zeroAttr, rewriter); - - // Transpose the kernel to match dimension ordering of the linalg - // convolution operation. - // TODO(suderman): See if this can be efficiently folded - check whether - // the input is used anywhere else, if not fold the constant. 
- SmallVector weightPerm{1, 2, 3, 0}; - SmallVector newWeightShape{weightShape[1], weightShape[2], - weightShape[3], weightShape[0]}; - auto weightPermAttr = DenseIntElementsAttr::get( - RankedTensorType::get({4}, rewriter.getI64Type()), weightPerm); - Value weightPermValue = - rewriter.create(loc, weightPermAttr); - Type newWeightTy = - RankedTensorType::get(newWeightShape, weightTy.getElementType()); - weight = rewriter.create(loc, newWeightTy, weight, - weightPermValue); - - Attribute resultZeroAttr = rewriter.getZeroAttr(resultETy); - Value initTensor = rewriter.create( - loc, resultTy.getShape(), resultETy); - Value zero = rewriter.create(loc, resultZeroAttr); - Value zeroTensor = - rewriter.create(loc, zero, initTensor).getResult(0); - - // Extract the attributes for convolution. - llvm::SmallVector stride, dilation; - getValuesFromIntArrayAttribute(strideTosaAttr, stride); - getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); - - // Create the convolution op. - auto strideAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), stride); - auto dilationAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), dilation); - - // Create maps for the bias broadcasting - SmallVector indexingMaps; - indexingMaps.push_back(AffineMap::get( - /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, - {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); - - Value biasInitTensor = rewriter.create( - loc, resultTy.getShape(), resultETy); - - if (isQuantized) { - auto quantizationInfo = - op->getAttr("quantization_info").cast(); - auto iZp = rewriter.getI32IntegerAttr( - quantizationInfo.input_zp().getValue().getSExtValue()); - auto kZp = rewriter.getI32IntegerAttr( - quantizationInfo.weight_zp().getValue().getSExtValue()); - - auto iZpVal = rewriter.create(loc, iZp); - auto kZpVal = rewriter.create(loc, kZp); - Value conv = - rewriter - .create( - loc, resultTy, ValueRange{input, weight, iZpVal, kZpVal}, - ValueRange{zeroTensor}, strideAttr, dilationAttr) - ->getResult(0); - - Value result = - rewriter - .create( - loc, resultTy, ValueRange({bias, conv}), biasInitTensor, - indexingMaps, getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - return success(); - } - - Value conv = rewriter - .create( - loc, resultTy, ValueRange{input, weight}, - ValueRange{zeroTensor}, strideAttr, dilationAttr) - ->getResult(0); - - Value result = - rewriter - .create( - loc, resultTy, ValueRange({bias, conv}), biasInitTensor, - indexingMaps, getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - - rewriter.replaceOp(op, result); - return success(); - } -}; - -class DepthwiseConvConverter - : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(tosa::DepthwiseConv2DOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Location loc = op->getLoc(); - Value input = op->getOperand(0); - Value 
weight = op->getOperand(1); - Value bias = op->getOperand(2); - - ShapedType inputTy = input.getType().cast(); - ShapedType weightTy = weight.getType().cast(); - ShapedType biasTy = bias.getType().cast(); - ShapedType resultTy = op->getResult(0).getType().cast(); - - Type inputETy = inputTy.getElementType(); - Type resultETy = resultTy.getElementType(); - - auto padAttr = op->getAttr("pad").cast(); - auto strideTosaAttr = op->getAttr("stride").cast(); - auto dilationTosaAttr = op->getAttr("dilation").cast(); - - bool isQuantized = op->hasAttr("quantization_info"); - IntegerAttr iZp; - IntegerAttr kZp; - if (isQuantized) { - auto quantizationInfo = - op->getAttr("quantization_info").cast(); - iZp = rewriter.getI32IntegerAttr( - quantizationInfo.input_zp().getValue().getSExtValue()); - kZp = rewriter.getI32IntegerAttr( - quantizationInfo.weight_zp().getValue().getSExtValue()); - } - - if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || - !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) - return rewriter.notifyMatchFailure(op, - "tosa.conv ops require static shapes"); - - auto weightShape = weightTy.getShape(); - auto resultShape = resultTy.getShape(); - - // Apply padding as necessary. - Attribute zeroAttr = rewriter.getZeroAttr(inputETy); - if (isQuantized) { - auto quantizationInfo = - op->getAttr("quantization_info").cast(); - auto iZp = quantizationInfo.input_zp().getValue().getSExtValue(); - - int64_t intMin = - APInt::getSignedMinValue(inputETy.getIntOrFloatBitWidth()) - .getSExtValue(); - int64_t intMax = - APInt::getSignedMaxValue(inputETy.getIntOrFloatBitWidth()) - .getSExtValue(); - - if (iZp < intMin || iZp > intMax) - return rewriter.notifyMatchFailure( - op, "tosa.depthwise_conv op quantization has zp outside of input " - "range"); - - zeroAttr = rewriter.getIntegerAttr(inputETy, iZp); - } - - llvm::SmallVector pad; - pad.resize(2, 0); - getValuesFromIntArrayAttribute(padAttr, pad); - pad.resize(pad.size() + 2, 0); - - input = applyPad(loc, input, pad, zeroAttr, rewriter); - - // Extract the attributes for convolution. - llvm::SmallVector stride, dilation; - getValuesFromIntArrayAttribute(strideTosaAttr, stride); - getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); - - // Create the convolution op. - auto strideAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), stride); - auto dilationAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), dilation); - ShapedType linalgConvTy = - RankedTensorType::get({resultShape[0], resultShape[1], resultShape[2], - weightShape[2], weightShape[3]}, - resultETy); - - // Broadcast the initial value to the output tensor before convolving. 
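// Schematically, for the rank-4 NHWC result the three indexing maps built
// here are (a sketch of what AffineMap::get and getMultiDimIdentityMap
// produce):
//   bias:   (d0, d1, d2, d3) -> (d3)              broadcast over N, H, W
//   conv:   (d0, d1, d2, d3) -> (d0, d1, d2, d3)  identity
//   output: (d0, d1, d2, d3) -> (d0, d1, d2, d3)  identity
// so the trailing linalg.generic adds the per-channel bias to every spatial
// position of the convolution result.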
- SmallVector indexingMaps; - indexingMaps.push_back(AffineMap::get( - /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, - {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); - - Attribute resultZeroAttr = rewriter.getZeroAttr(resultETy); - Value initTensor = rewriter.create( - loc, linalgConvTy.getShape(), resultETy); - Value zero = rewriter.create(loc, resultZeroAttr); - Value zeroTensor = - rewriter.create(loc, zero, initTensor).getResult(0); - - Value biasInitTensor = rewriter.create( - loc, resultTy.getShape(), resultETy); - if (!isQuantized) { - Value conv = rewriter - .create( - loc, linalgConvTy, ValueRange{input, weight}, - ValueRange{zeroTensor}, strideAttr, dilationAttr) - .getResult(0); - Value convReshape = rewriter.create(loc, resultTy, conv); - Value result = - rewriter - .create( - loc, resultTy, ValueRange({bias, convReshape}), - biasInitTensor, indexingMaps, - getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - } else { - auto iZpVal = rewriter.create(loc, iZp); - auto kZpVal = rewriter.create(loc, kZp); - Value conv = - rewriter - .create( - loc, linalgConvTy, ValueRange{input, weight, iZpVal, kZpVal}, - ValueRange{zeroTensor}, strideAttr, dilationAttr) - .getResult(0); - Value convReshape = rewriter.create(loc, resultTy, conv); - Value result = - rewriter - .create( - loc, resultTy, ValueRange({bias, convReshape}), - biasInitTensor, indexingMaps, - getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - } - return success(); - } -}; - -class MatMulConverter : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(tosa::MatMulOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Location loc = op.getLoc(); - - auto outputTy = op.getType().cast(); - auto outputElementTy = outputTy.getElementType(); - - auto firstOperandTy = op->getOperand(0).getType().cast(); - auto secondOperandTy = op->getOperand(1).getType().cast(); - - SmallVector dynDims; - dynDims.resize(op->getResult(0).getType().cast().getRank()); - - if (!firstOperandTy.hasRank() || firstOperandTy.isDynamicDim(0)) { - dynDims[0] = rewriter.create(loc, op->getOperand(0), 0); - } - - if (!firstOperandTy.hasRank() || firstOperandTy.isDynamicDim(1)) { - dynDims[1] = rewriter.create(loc, op->getOperand(0), 1); - } - - if (!secondOperandTy.hasRank() || secondOperandTy.isDynamicDim(2)) { - dynDims[2] = rewriter.create(loc, op->getOperand(1), 2); - } - - SmallVector filteredDims = filterDynamicDims(dynDims); - - auto zeroAttr = rewriter.getZeroAttr(outputElementTy); - Value zero = rewriter.create(loc, zeroAttr); - auto initTensor = rewriter.create( - loc, filteredDims, outputTy.getShape(), outputTy.getElementType()); - Value zeroTensor = - rewriter.create(loc, zero, initTensor).getResult(0); - if (!op.quantization_info()) { - rewriter.replaceOpWithNewOp( - op, TypeRange{op.getType()}, ValueRange{adaptor.a(), adaptor.b()}, 
- ValueRange{zeroTensor}); - return success(); - } - - auto quantizationInfo = op.quantization_info().getValue(); - auto aZp = rewriter.create( - loc, rewriter.getI32IntegerAttr( - quantizationInfo.a_zp().getValue().getSExtValue())); - auto bZp = rewriter.create( - loc, rewriter.getI32IntegerAttr( - quantizationInfo.b_zp().getValue().getSExtValue())); - rewriter.replaceOpWithNewOp( - op, TypeRange{op.getType()}, - ValueRange{adaptor.a(), adaptor.b(), aZp, bZp}, zeroTensor); - - return success(); - } -}; - -class FullyConnectedConverter - : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(tosa::FullyConnectedOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Location loc = op.getLoc(); - auto outputTy = op.getType().cast(); - auto input = op.input(); - auto inputTy = input.getType().cast(); - - auto bias = op.bias(); - - auto weight = op.weight(); - auto weightTy = weight.getType().cast(); - auto weightShape = weightTy.getShape(); - - auto outputETy = outputTy.getElementType(); - - SmallVector dynDims; - dynDims.resize(op->getResult(0).getType().cast().getRank()); - - if (!inputTy.hasRank() || inputTy.isDynamicDim(0)) { - dynDims[0] = rewriter.create(loc, input, 0); - } - - if (!weightTy.hasRank() || weightTy.isDynamicDim(0)) { - dynDims[1] = rewriter.create(loc, weight, 0); - } - - SmallVector filteredDims = filterDynamicDims(dynDims); - - // Creating maps for the output of MatMul and the bias - SmallVector indexingMaps; - - // Broadcast the bias. - indexingMaps.push_back(AffineMap::get(/*dimCount=*/2, /*symbolCount=*/0, - {rewriter.getAffineDimExpr(1)}, - rewriter.getContext())); - - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank())); - indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank())); - - auto initTensor = rewriter.create( - loc, filteredDims, outputTy.getShape(), outputTy.getElementType()); - - // When quantized, the input elemeny type is not the same as the output - Attribute resultZeroAttr = rewriter.getZeroAttr(outputETy); - Value zero = rewriter.create(loc, resultZeroAttr); - Value zeroTensor = - rewriter.create(loc, zero, initTensor).getResult(0); - - SmallVector permutation{1, 0}; - auto permutationAttr = DenseIntElementsAttr::get( - RankedTensorType::get({2}, rewriter.getI64Type()), permutation); - Value permutationValue = - rewriter.create(loc, permutationAttr); - - SmallVector newWeightShape{weightShape[1], weightShape[0]}; - Type newWeightTy = - RankedTensorType::get(newWeightShape, weightTy.getElementType()); - - Value transposedWeight = rewriter.create( - loc, newWeightTy, weight, permutationValue); - - auto biasInitTensor = - rewriter - .create(loc, filteredDims, - outputTy.getShape(), outputETy) - ->getResults(); - - if (!op.quantization_info()) { - Value matmul = rewriter - .create( - loc, TypeRange{op.getType()}, - ValueRange{input, transposedWeight}, zeroTensor) - ->getResult(0); - - Value result = - rewriter - .create( - loc, outputTy, ValueRange({bias, matmul}), biasInitTensor, - indexingMaps, getNParallelLoopsAttrs(outputTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - return success(); - } - - auto quantizationInfo = op.quantization_info().getValue(); - auto inputZp = rewriter.create( - loc, 
rewriter.getI32IntegerAttr( - quantizationInfo.input_zp().getValue().getSExtValue())); - auto outputZp = rewriter.create( - loc, rewriter.getI32IntegerAttr( - quantizationInfo.weight_zp().getValue().getSExtValue())); - Value matmul = - rewriter - .create( - loc, TypeRange{op.getType()}, - ValueRange{input, transposedWeight, inputZp, outputZp}, - zeroTensor) - ->getResult(0); - Value result = - rewriter - .create( - loc, outputTy, ValueRange({bias, matmul}), biasInitTensor, - indexingMaps, getNParallelLoopsAttrs(outputTy.getRank()), - [&](OpBuilder &nestedBuilder, Location nestedLoc, - ValueRange args) { - Value added = nestedBuilder.create( - loc, args[0], args[1]); - nestedBuilder.create(nestedLoc, added); - }) - .getResult(0); - rewriter.replaceOp(op, result); - return success(); - } -}; - class ReshapeConverterCollapse : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -2810,277 +2275,6 @@ } }; -class MaxPool2dConverter : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(tosa::MaxPool2dOp op, - PatternRewriter &rewriter) const final { - Location loc = op.getLoc(); - Value input = op.input(); - ShapedType inputTy = input.getType().cast(); - - ShapedType resultTy = op.getType().template cast(); - Type resultETy = inputTy.getElementType(); - - if (!inputTy.hasStaticShape()) - return failure(); - - // Determine what the initial value needs to be for the max pool op. - Attribute initialAttr; - if (resultETy.isF32()) - initialAttr = rewriter.getFloatAttr( - resultETy, - APFloat::getLargest(resultETy.cast().getFloatSemantics(), - true)); - - if (resultETy.isa()) - initialAttr = rewriter.getIntegerAttr( - resultETy, - APInt::getSignedMinValue(resultETy.getIntOrFloatBitWidth())); - - if (!initialAttr) - return rewriter.notifyMatchFailure( - op, "Unsupported initial value for tosa.maxpool_2d op"); - - // Apply padding as necessary. - llvm::SmallVector pad; - pad.resize(2, 0); - getValuesFromIntArrayAttribute(op.pad(), pad); - pad.resize(pad.size() + 2, 0); - Value paddedInput = applyPad(loc, input, pad, initialAttr, rewriter); - - Value initialValue = rewriter.create(loc, initialAttr); - - SmallVector kernel, stride; - getValuesFromIntArrayAttribute(op.kernel(), kernel); - getValuesFromIntArrayAttribute(op.stride(), stride); - - Attribute strideAttr = rewriter.getI64VectorAttr(stride); - Attribute dilationAttr = rewriter.getI64VectorAttr({1, 1}); - - // Create the linalg op that performs pooling. - Value initTensor = rewriter.create( - loc, resultTy.getShape(), resultTy.getElementType()); - - Value filledInitTensor = - rewriter.create(loc, initialValue, initTensor).result(); - - Value fakeWindowDims = - rewriter.create(loc, kernel, resultETy); - - rewriter.replaceOpWithNewOp( - op, ArrayRef{resultTy}, ValueRange{paddedInput, fakeWindowDims}, - filledInitTensor, strideAttr, dilationAttr); - return success(); - } -}; - -class AvgPool2dConverter : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(tosa::AvgPool2dOp op, - PatternRewriter &rewriter) const final { - Location loc = op.getLoc(); - Value input = op.input(); - ShapedType inputTy = input.getType().cast(); - Type inElementTy = inputTy.getElementType(); - - ShapedType resultTy = op.getType().template cast(); - Type resultETy = op.getType().cast().getElementType(); - - Type accETy = - inElementTy.isa() ? 
rewriter.getI32Type() : inElementTy; - ShapedType accTy = resultTy.clone(accETy); - - if (!inputTy.hasStaticShape()) - return failure(); - - // Apply padding as necessary. - llvm::SmallVector pad; - pad.resize(2, 0); - getValuesFromIntArrayAttribute(op.pad(), pad); - pad.resize(pad.size() + 2, 0); - Attribute padAttr = rewriter.getZeroAttr(inElementTy); - Value paddedInput = applyPad(loc, input, pad, padAttr, rewriter); - - Attribute initialAttr = rewriter.getZeroAttr(accETy); - Value initialValue = rewriter.create(loc, initialAttr); - - SmallVector kernel, stride; - getValuesFromIntArrayAttribute(op.kernel(), kernel); - getValuesFromIntArrayAttribute(op.stride(), stride); - - Attribute strideAttr = rewriter.getI64VectorAttr(stride); - Attribute dilationAttr = rewriter.getI64VectorAttr({1, 1}); - - // Create the linalg op that performs pooling. - Value poolInitTensor = - rewriter.create(loc, accTy.getShape(), accETy); - - Value filledInitTensor = - rewriter.create(loc, initialValue, poolInitTensor) - .result(); - - Value fakeWindowDims = - rewriter.create(loc, kernel, accETy); - - // Sum across the pooled region. - Value poolingOp = rewriter - .create( - loc, ArrayRef{accTy}, - ValueRange{paddedInput, fakeWindowDims}, - filledInitTensor, strideAttr, dilationAttr) - .getResult(0); - - // Normalize the summed value by the number of elements grouped in each - // pool. - auto poolingOpTy = poolingOp.getType().cast(); - auto affineMap = rewriter.getMultiDimIdentityMap(resultTy.getRank()); - - Value genericInitTensor = rewriter.create( - loc, resultTy.getShape(), resultETy); - - auto genericOp = rewriter.create( - loc, ArrayRef({resultTy}), ValueRange{poolingOp}, - ValueRange{genericInitTensor}, - ArrayRef({affineMap, affineMap}), - getNParallelLoopsAttrs(resultTy.getRank()), - [&](OpBuilder &b, Location loc, ValueRange args) { - auto zero = rewriter.create(loc, 0); - auto one = rewriter.create(loc, 1); - auto iH = rewriter.create( - loc, poolingOpTy.getDimSize(1) - 1); - auto iW = rewriter.create( - loc, poolingOpTy.getDimSize(2) - 1); - - // Compute the indices from either end. - auto y0 = rewriter.create(loc, 1); - auto x0 = rewriter.create(loc, 2); - auto y1 = rewriter.create(loc, iH, y0); - auto x1 = rewriter.create(loc, iW, x0); - - // Determines what the portion of valid input is covered by the - // kernel. - auto padFn = [&](Value v, Value x, int64_t pad) -> Value { - if (pad == 0) - return v; - - auto padVal = rewriter.create(loc, pad); - Value dx = rewriter.create(loc, x, padVal); - - Value cmp = rewriter.create( - loc, arith::CmpIPredicate::slt, dx, zero); - Value offset = rewriter.create(loc, cmp, dx, zero); - return rewriter.create(loc, v, offset)->getResult(0); - }; - - // Compute the vertical component of coverage. - auto kH0 = rewriter.create(loc, kernel[0]); - auto kH1 = padFn(kH0, y0, pad[2]); - auto kH2 = padFn(kH1, y1, pad[3]); - auto kHCmp = rewriter.create( - loc, arith::CmpIPredicate::slt, kH2, one); - auto kH3 = rewriter.create(loc, kHCmp, one, kH2); - - // compute the horizontal component of coverage. - auto kW0 = rewriter.create(loc, kernel[1]); - auto kW1 = padFn(kW0, x0, pad[4]); - auto kW2 = padFn(kW1, x1, pad[5]); - auto kWCmp = rewriter.create( - loc, arith::CmpIPredicate::slt, kW2, one); - auto kW3 = rewriter.create(loc, kWCmp, one, kW2); - - // Compute the total number of elements and normalize. - Value count = rewriter.create(loc, kH3, kW3); - auto countI = rewriter.create( - loc, rewriter.getI32Type(), count); - - // Divide by the number of summed values. 
For floats this is just - // a div however for quantized values input normalization had - // to be applied. - Value poolVal = args[0]; - if (accETy.isa()) { - auto countF = rewriter.create(loc, accETy, countI); - poolVal = rewriter.create(loc, poolVal, countF) - ->getResult(0); - } else { - - // If we have quantization information we need to apply an offset - // for the input zp value. - if (op.quantization_info()) { - auto quantizationInfo = op.quantization_info().getValue(); - auto inputZp = rewriter.create( - loc, quantizationInfo.input_zp()); - Value offset = - rewriter.create(loc, accETy, countI, inputZp); - poolVal = - rewriter.create(loc, accETy, poolVal, offset); - } - - // Compute the multiplier and shift values for the quantization - // normalization. Preferably we would want to compute more bits - // however 32-bits should be enough for compute. Honestly we - // should probably straight divide. - int64_t numerator = ((1 << 30) + 1); - int64_t shift = 30; - - Value numeratorVal = rewriter.create( - loc, rewriter.getI32IntegerAttr(numerator)); - Value multiplierVal = - rewriter - .create(loc, rewriter.getI32Type(), - numeratorVal, countI) - .getResult(); - Value shiftVal = rewriter.create( - loc, rewriter.getI8IntegerAttr(shift)); - - auto scaled = - rewriter - .create( - loc, rewriter.getI32Type(), poolVal, multiplierVal, - shiftVal, rewriter.getBoolAttr(false)) - .getResult(); - - // If we have quantization information we need to apply output - // zeropoint. - if (op.quantization_info()) { - auto quantizationInfo = op.quantization_info().getValue(); - auto outputZp = rewriter.create( - loc, quantizationInfo.output_zp()); - scaled = rewriter.create(loc, scaled, outputZp) - .getResult(); - } - - // Apply Clip. - int64_t outBitwidth = resultETy.getIntOrFloatBitWidth(); - - auto min = rewriter.create( - loc, APInt::getSignedMinValue(outBitwidth).getSExtValue(), - accETy); - auto max = rewriter.create( - loc, APInt::getSignedMaxValue(outBitwidth).getSExtValue(), - accETy); - auto clamp = clampHelper( - loc, scaled, min, max, arith::CmpIPredicate::slt, rewriter); - - poolVal = clamp; - // Convert type. - if (resultETy != clamp.getType()) { - poolVal = - rewriter.create(loc, resultETy, poolVal); - } - } - - rewriter.create(loc, poolVal); - }); - - rewriter.replaceOp(op, genericOp.getResult(0)); - return success(); - } -}; - } // namespace void mlir::tosa::populateTosaToLinalgConversionPatterns( @@ -3132,8 +2326,6 @@ ReduceConverter, ArgMaxConverter, ConcatConverter, - ConvConverter, - DepthwiseConvConverter, GatherConverter, PadConverter, ReshapeConverterCollapse, @@ -3144,10 +2336,6 @@ ReverseConverter, TableConverter, TileConverter, - TransposeConverter, - MatMulConverter, - MaxPool2dConverter, - AvgPool2dConverter, - FullyConnectedConverter>(patterns->getContext()); + TransposeConverter>(patterns->getContext()); // clang-format on } diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -0,0 +1,885 @@ +//===- TosaToLinalgNamed.cpp - Lowering Tosa to Linalg Named Ops ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// These rewriters lower from the Tosa to the Linalg named ops. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" +#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tosa/IR/TosaOps.h" +#include "mlir/Dialect/Utils/ReshapeOpsUtils.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include + +using namespace mlir; + +static SmallVector getNParallelLoopsAttrs(unsigned nParallelLoops) { + return SmallVector(nParallelLoops, getParallelIteratorTypeName()); +} + +template +static void getValuesFromIntArrayAttribute(ArrayAttr attr, + SmallVector &arrayValues) { + for (Attribute val : attr.getValue()) { + arrayValues.push_back(val.cast().getValue().getSExtValue()); + } +} + +template +static mlir::SelectOp clampHelper(Location loc, Value arg, + arith::ConstantOp min, arith::ConstantOp max, + P pred, OpBuilder &rewriter) { + auto smallerThanMin = rewriter.create(loc, pred, arg, min); + auto minOrArg = + rewriter.create(loc, smallerThanMin, min, arg); + auto largerThanMax = rewriter.create(loc, pred, max, arg); + return rewriter.create(loc, largerThanMax, max, minOrArg); +} + +static mlir::Value applyPad(Location loc, Value input, ArrayRef pad, + Attribute padAttr, OpBuilder &rewriter) { + // Input should be padded if necessary. 
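+  // `pad` interleaves low/high amounts per dimension as {low0, high0, low1,
+  // high1, ...}, so a rank-N input needs exactly 2*N entries (asserted
+  // below). For example, padding only the spatial dims of an NHWC tensor by
+  // one element on each edge would be {0, 0, 1, 1, 1, 1, 0, 0}.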
+ if (llvm::all_of(pad, [](int64_t p) { return p == 0; })) + return input; + + ShapedType inputTy = input.getType().cast(); + Type inputETy = inputTy.getElementType(); + auto inputShape = inputTy.getShape(); + + assert((inputShape.size() * 2) == pad.size()); + + SmallVector paddedShape; + SmallVector lowIndices; + SmallVector highIndices; + for (int i = 0, s = inputShape.size(); i < s; i++) { + auto lowPad = pad[i * 2]; + auto highPad = pad[i * 2 + 1]; + paddedShape.push_back(inputShape[i] + highPad + lowPad); + lowIndices.push_back(rewriter.getIndexAttr(lowPad)); + highIndices.push_back(rewriter.getIndexAttr(highPad)); + } + + Value padValue = rewriter.create(loc, padAttr); + + return linalg::PadTensorOp::createPadScalarOp( + RankedTensorType::get(paddedShape, inputETy), input, padValue, + lowIndices, highIndices, /*nofold=*/false, loc, rewriter) + .result(); +} + +static SmallVector filterDynamicDims(SmallVector dynDims) { + SmallVector filteredDims; + for (auto dim : dynDims) + if (dim) + filteredDims.push_back(dim); + return filteredDims; +} + +namespace { + +class ConvConverter : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::Conv2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + Location loc = op->getLoc(); + Value input = op->getOperand(0); + Value weight = op->getOperand(1); + Value bias = op->getOperand(2); + + ShapedType inputTy = input.getType().cast(); + ShapedType weightTy = weight.getType().cast(); + ShapedType biasTy = bias.getType().cast(); + ShapedType resultTy = op->getResult(0).getType().cast(); + + Type inputETy = inputTy.getElementType(); + Type resultETy = resultTy.getElementType(); + + auto padAttr = op->getAttr("pad").cast(); + auto strideTosaAttr = op->getAttr("stride").cast(); + auto dilationTosaAttr = op->getAttr("dilation").cast(); + bool isQuantized = op->hasAttr("quantization_info"); + + if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || + !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) + return rewriter.notifyMatchFailure(op, + "tosa.conv ops require static shapes"); + + if (inputETy.isUnsignedInteger()) + return rewriter.notifyMatchFailure( + op, "tosa.conv ops does not support unsigned integer input"); + + auto weightShape = weightTy.getShape(); + + // Apply padding as necessary. + Attribute zeroAttr = rewriter.getZeroAttr(inputETy); + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast(); + auto iZp = quantizationInfo.input_zp().getValue().getSExtValue(); + + int64_t intMin = + APInt::getSignedMinValue(inputETy.getIntOrFloatBitWidth()) + .getSExtValue(); + int64_t intMax = + APInt::getSignedMaxValue(inputETy.getIntOrFloatBitWidth()) + .getSExtValue(); + + if (iZp < intMin || iZp > intMax) + return rewriter.notifyMatchFailure( + op, "tosa.conv op quantization has zp outside of input range"); + + zeroAttr = rewriter.getIntegerAttr(inputETy, iZp); + } + + llvm::SmallVector pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(padAttr, pad); + pad.resize(pad.size() + 2, 0); + input = applyPad(loc, input, pad, zeroAttr, rewriter); + + // Transpose the kernel to match dimension ordering of the linalg + // convolution operation. + // TODO(suderman): See if this can be efficiently folded - check whether + // the input is used anywhere else, if not fold the constant. 
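+    // TOSA stores conv2d filters as [OC, KH, KW, IC], while the linalg named
+    // convolution expects [KH, KW, IC, OC]; the {1, 2, 3, 0} permutation
+    // below relayouts the weights with a tosa.transpose before the linalg op
+    // is built.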
+ SmallVector weightPerm{1, 2, 3, 0}; + SmallVector newWeightShape{weightShape[1], weightShape[2], + weightShape[3], weightShape[0]}; + auto weightPermAttr = DenseIntElementsAttr::get( + RankedTensorType::get({4}, rewriter.getI64Type()), weightPerm); + Value weightPermValue = + rewriter.create(loc, weightPermAttr); + Type newWeightTy = + RankedTensorType::get(newWeightShape, weightTy.getElementType()); + weight = rewriter.create(loc, newWeightTy, weight, + weightPermValue); + + Attribute resultZeroAttr = rewriter.getZeroAttr(resultETy); + Value initTensor = rewriter.create( + loc, resultTy.getShape(), resultETy); + Value zero = rewriter.create(loc, resultZeroAttr); + Value zeroTensor = + rewriter.create(loc, zero, initTensor).getResult(0); + + // Extract the attributes for convolution. + llvm::SmallVector stride, dilation; + getValuesFromIntArrayAttribute(strideTosaAttr, stride); + getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); + + // Create the convolution op. + auto strideAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), stride); + auto dilationAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), dilation); + + // Create maps for the bias broadcasting + SmallVector indexingMaps; + indexingMaps.push_back(AffineMap::get( + /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, + {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + + Value biasInitTensor = rewriter.create( + loc, resultTy.getShape(), resultETy); + + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast(); + auto iZp = rewriter.getI32IntegerAttr( + quantizationInfo.input_zp().getValue().getSExtValue()); + auto kZp = rewriter.getI32IntegerAttr( + quantizationInfo.weight_zp().getValue().getSExtValue()); + + auto iZpVal = rewriter.create(loc, iZp); + auto kZpVal = rewriter.create(loc, kZp); + Value conv = + rewriter + .create( + loc, resultTy, ValueRange{input, weight, iZpVal, kZpVal}, + ValueRange{zeroTensor}, strideAttr, dilationAttr) + ->getResult(0); + + Value result = + rewriter + .create( + loc, resultTy, ValueRange({bias, conv}), biasInitTensor, + indexingMaps, getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + return success(); + } + + Value conv = rewriter + .create( + loc, resultTy, ValueRange{input, weight}, + ValueRange{zeroTensor}, strideAttr, dilationAttr) + ->getResult(0); + + Value result = + rewriter + .create( + loc, resultTy, ValueRange({bias, conv}), biasInitTensor, + indexingMaps, getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + + rewriter.replaceOp(op, result); + return success(); + } +}; + +class DepthwiseConvConverter + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::DepthwiseConv2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + Location loc = op->getLoc(); + Value input = op->getOperand(0); + Value 
weight = op->getOperand(1); + Value bias = op->getOperand(2); + + ShapedType inputTy = input.getType().cast(); + ShapedType weightTy = weight.getType().cast(); + ShapedType biasTy = bias.getType().cast(); + ShapedType resultTy = op->getResult(0).getType().cast(); + + Type inputETy = inputTy.getElementType(); + Type resultETy = resultTy.getElementType(); + + auto padAttr = op->getAttr("pad").cast(); + auto strideTosaAttr = op->getAttr("stride").cast(); + auto dilationTosaAttr = op->getAttr("dilation").cast(); + + bool isQuantized = op->hasAttr("quantization_info"); + IntegerAttr iZp; + IntegerAttr kZp; + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast(); + iZp = rewriter.getI32IntegerAttr( + quantizationInfo.input_zp().getValue().getSExtValue()); + kZp = rewriter.getI32IntegerAttr( + quantizationInfo.weight_zp().getValue().getSExtValue()); + } + + if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() || + !biasTy.hasStaticShape() || !resultTy.hasStaticShape()) + return rewriter.notifyMatchFailure(op, + "tosa.conv ops require static shapes"); + + auto weightShape = weightTy.getShape(); + auto resultShape = resultTy.getShape(); + + // Apply padding as necessary. + Attribute zeroAttr = rewriter.getZeroAttr(inputETy); + if (isQuantized) { + auto quantizationInfo = + op->getAttr("quantization_info").cast(); + auto iZp = quantizationInfo.input_zp().getValue().getSExtValue(); + + int64_t intMin = + APInt::getSignedMinValue(inputETy.getIntOrFloatBitWidth()) + .getSExtValue(); + int64_t intMax = + APInt::getSignedMaxValue(inputETy.getIntOrFloatBitWidth()) + .getSExtValue(); + + if (iZp < intMin || iZp > intMax) + return rewriter.notifyMatchFailure( + op, "tosa.depthwise_conv op quantization has zp outside of input " + "range"); + + zeroAttr = rewriter.getIntegerAttr(inputETy, iZp); + } + + llvm::SmallVector pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(padAttr, pad); + pad.resize(pad.size() + 2, 0); + + input = applyPad(loc, input, pad, zeroAttr, rewriter); + + // Extract the attributes for convolution. + llvm::SmallVector stride, dilation; + getValuesFromIntArrayAttribute(strideTosaAttr, stride); + getValuesFromIntArrayAttribute(dilationTosaAttr, dilation); + + // Create the convolution op. + auto strideAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), stride); + auto dilationAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), dilation); + ShapedType linalgConvTy = + RankedTensorType::get({resultShape[0], resultShape[1], resultShape[2], + weightShape[2], weightShape[3]}, + resultETy); + + // Broadcast the initial value to the output tensor before convolving. 
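+    // The depthwise named op produces a rank-5 result [N, H, W, C, M] that
+    // keeps the channel multiplier separate; `linalgConvTy` above models
+    // this, and a tosa.reshape below collapses [C, M] back into the C*M
+    // output channels of `resultTy`.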
+ SmallVector indexingMaps; + indexingMaps.push_back(AffineMap::get( + /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0, + {rewriter.getAffineDimExpr(3)}, rewriter.getContext())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank())); + + Attribute resultZeroAttr = rewriter.getZeroAttr(resultETy); + Value initTensor = rewriter.create( + loc, linalgConvTy.getShape(), resultETy); + Value zero = rewriter.create(loc, resultZeroAttr); + Value zeroTensor = + rewriter.create(loc, zero, initTensor).getResult(0); + + Value biasInitTensor = rewriter.create( + loc, resultTy.getShape(), resultETy); + if (!isQuantized) { + Value conv = rewriter + .create( + loc, linalgConvTy, ValueRange{input, weight}, + ValueRange{zeroTensor}, strideAttr, dilationAttr) + .getResult(0); + Value convReshape = rewriter.create( + loc, resultTy, conv, rewriter.getI64ArrayAttr(resultTy.getShape())); + Value result = + rewriter + .create( + loc, resultTy, ValueRange({bias, convReshape}), + biasInitTensor, indexingMaps, + getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + } else { + auto iZpVal = rewriter.create(loc, iZp); + auto kZpVal = rewriter.create(loc, kZp); + Value conv = + rewriter + .create( + loc, linalgConvTy, ValueRange{input, weight, iZpVal, kZpVal}, + ValueRange{zeroTensor}, strideAttr, dilationAttr) + .getResult(0); + Value convReshape = rewriter.create( + loc, resultTy, conv, rewriter.getI64ArrayAttr(resultTy.getShape())); + Value result = + rewriter + .create( + loc, resultTy, ValueRange({bias, convReshape}), + biasInitTensor, indexingMaps, + getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + } + return success(); + } +}; + +class MatMulConverter : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::MatMulOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + + auto outputTy = op.getType().cast(); + auto outputElementTy = outputTy.getElementType(); + + auto firstOperandTy = op->getOperand(0).getType().cast(); + auto secondOperandTy = op->getOperand(1).getType().cast(); + + SmallVector dynDims; + dynDims.resize(op->getResult(0).getType().cast().getRank()); + + if (!firstOperandTy.hasRank() || firstOperandTy.isDynamicDim(0)) { + dynDims[0] = rewriter.create(loc, op->getOperand(0), 0); + } + + if (!firstOperandTy.hasRank() || firstOperandTy.isDynamicDim(1)) { + dynDims[1] = rewriter.create(loc, op->getOperand(0), 1); + } + + if (!secondOperandTy.hasRank() || secondOperandTy.isDynamicDim(2)) { + dynDims[2] = rewriter.create(loc, op->getOperand(1), 2); + } + + SmallVector filteredDims = filterDynamicDims(dynDims); + + auto zeroAttr = rewriter.getZeroAttr(outputElementTy); + Value zero = rewriter.create(loc, zeroAttr); + auto initTensor = rewriter.create( + loc, filteredDims, outputTy.getShape(), outputTy.getElementType()); + Value zeroTensor = + rewriter.create(loc, zero, initTensor).getResult(0); + if (!op.quantization_info()) { 
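+      // Non-quantized case: tosa.matmul on rank-3 (batched) operands lowers
+      // 1:1 to linalg.batch_matmul accumulating into the zero-filled init
+      // tensor.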
+ rewriter.replaceOpWithNewOp( + op, TypeRange{op.getType()}, ValueRange{adaptor.a(), adaptor.b()}, + ValueRange{zeroTensor}); + return success(); + } + + auto quantizationInfo = op.quantization_info().getValue(); + auto aZp = rewriter.create( + loc, rewriter.getI32IntegerAttr( + quantizationInfo.a_zp().getValue().getSExtValue())); + auto bZp = rewriter.create( + loc, rewriter.getI32IntegerAttr( + quantizationInfo.b_zp().getValue().getSExtValue())); + rewriter.replaceOpWithNewOp( + op, TypeRange{op.getType()}, + ValueRange{adaptor.a(), adaptor.b(), aZp, bZp}, zeroTensor); + + return success(); + } +}; + +class FullyConnectedConverter + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(tosa::FullyConnectedOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + auto outputTy = op.getType().cast(); + auto input = op.input(); + auto inputTy = input.getType().cast(); + + auto bias = op.bias(); + + auto weight = op.weight(); + auto weightTy = weight.getType().cast(); + auto weightShape = weightTy.getShape(); + + auto outputETy = outputTy.getElementType(); + + SmallVector dynDims; + dynDims.resize(op->getResult(0).getType().cast().getRank()); + + if (!inputTy.hasRank() || inputTy.isDynamicDim(0)) { + dynDims[0] = rewriter.create(loc, input, 0); + } + + if (!weightTy.hasRank() || weightTy.isDynamicDim(0)) { + dynDims[1] = rewriter.create(loc, weight, 0); + } + + SmallVector filteredDims = filterDynamicDims(dynDims); + + // Creating maps for the output of MatMul and the bias + SmallVector indexingMaps; + + // Broadcast the bias. + indexingMaps.push_back(AffineMap::get(/*dimCount=*/2, /*symbolCount=*/0, + {rewriter.getAffineDimExpr(1)}, + rewriter.getContext())); + + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank())); + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank())); + + auto initTensor = rewriter.create( + loc, filteredDims, outputTy.getShape(), outputTy.getElementType()); + + // When quantized, the input elemeny type is not the same as the output + Attribute resultZeroAttr = rewriter.getZeroAttr(outputETy); + Value zero = rewriter.create(loc, resultZeroAttr); + Value zeroTensor = + rewriter.create(loc, zero, initTensor).getResult(0); + + SmallVector permutation{1, 0}; + auto permutationAttr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI64Type()), permutation); + Value permutationValue = + rewriter.create(loc, permutationAttr); + + SmallVector newWeightShape{weightShape[1], weightShape[0]}; + Type newWeightTy = + RankedTensorType::get(newWeightShape, weightTy.getElementType()); + + Value transposedWeight = rewriter.create( + loc, newWeightTy, weight, permutationValue); + + auto biasInitTensor = + rewriter + .create(loc, filteredDims, + outputTy.getShape(), outputETy) + ->getResults(); + + if (!op.quantization_info()) { + Value matmul = rewriter + .create( + loc, TypeRange{op.getType()}, + ValueRange{input, transposedWeight}, zeroTensor) + ->getResult(0); + + Value result = + rewriter + .create( + loc, outputTy, ValueRange({bias, matmul}), biasInitTensor, + indexingMaps, getNParallelLoopsAttrs(outputTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + return success(); + } + + auto quantizationInfo = 
op.quantization_info().getValue(); + auto inputZp = rewriter.create( + loc, rewriter.getI32IntegerAttr( + quantizationInfo.input_zp().getValue().getSExtValue())); + auto outputZp = rewriter.create( + loc, rewriter.getI32IntegerAttr( + quantizationInfo.weight_zp().getValue().getSExtValue())); + Value matmul = + rewriter + .create( + loc, TypeRange{op.getType()}, + ValueRange{input, transposedWeight, inputZp, outputZp}, + zeroTensor) + ->getResult(0); + Value result = + rewriter + .create( + loc, outputTy, ValueRange({bias, matmul}), biasInitTensor, + indexingMaps, getNParallelLoopsAttrs(outputTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, added); + }) + .getResult(0); + rewriter.replaceOp(op, result); + return success(); + } +}; + +class MaxPool2dConverter : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::MaxPool2dOp op, + PatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + Value input = op.input(); + ShapedType inputTy = input.getType().cast(); + + ShapedType resultTy = op.getType().template cast(); + Type resultETy = inputTy.getElementType(); + + if (!inputTy.hasStaticShape()) + return failure(); + + // Determine what the initial value needs to be for the max pool op. + Attribute initialAttr; + if (resultETy.isF32()) + initialAttr = rewriter.getFloatAttr( + resultETy, + APFloat::getLargest(resultETy.cast().getFloatSemantics(), + true)); + + if (resultETy.isa()) + initialAttr = rewriter.getIntegerAttr( + resultETy, + APInt::getSignedMinValue(resultETy.getIntOrFloatBitWidth())); + + if (!initialAttr) + return rewriter.notifyMatchFailure( + op, "Unsupported initial value for tosa.maxpool_2d op"); + + // Apply padding as necessary. + llvm::SmallVector pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(op.pad(), pad); + pad.resize(pad.size() + 2, 0); + Value paddedInput = applyPad(loc, input, pad, initialAttr, rewriter); + + Value initialValue = rewriter.create(loc, initialAttr); + + SmallVector kernel, stride; + getValuesFromIntArrayAttribute(op.kernel(), kernel); + getValuesFromIntArrayAttribute(op.stride(), stride); + + Attribute strideAttr = rewriter.getI64VectorAttr(stride); + Attribute dilationAttr = rewriter.getI64VectorAttr({1, 1}); + + // Create the linalg op that performs pooling. + Value initTensor = rewriter.create( + loc, resultTy.getShape(), resultTy.getElementType()); + + Value filledInitTensor = + rewriter.create(loc, initialValue, initTensor).result(); + + Value fakeWindowDims = + rewriter.create(loc, kernel, resultETy); + + rewriter.replaceOpWithNewOp( + op, ArrayRef{resultTy}, ValueRange{paddedInput, fakeWindowDims}, + filledInitTensor, strideAttr, dilationAttr); + return success(); + } +}; + +class AvgPool2dConverter : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::AvgPool2dOp op, + PatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + Value input = op.input(); + ShapedType inputTy = input.getType().cast(); + Type inElementTy = inputTy.getElementType(); + + ShapedType resultTy = op.getType().template cast(); + Type resultETy = op.getType().cast().getElementType(); + + Type accETy = + inElementTy.isa() ? 
rewriter.getI32Type() : inElementTy; + ShapedType accTy = resultTy.clone(accETy); + + if (!inputTy.hasStaticShape()) + return failure(); + + // Apply padding as necessary. + llvm::SmallVector pad; + pad.resize(2, 0); + getValuesFromIntArrayAttribute(op.pad(), pad); + pad.resize(pad.size() + 2, 0); + Attribute padAttr = rewriter.getZeroAttr(inElementTy); + Value paddedInput = applyPad(loc, input, pad, padAttr, rewriter); + + Attribute initialAttr = rewriter.getZeroAttr(accETy); + Value initialValue = rewriter.create(loc, initialAttr); + + SmallVector kernel, stride; + getValuesFromIntArrayAttribute(op.kernel(), kernel); + getValuesFromIntArrayAttribute(op.stride(), stride); + + Attribute strideAttr = rewriter.getI64VectorAttr(stride); + Attribute dilationAttr = rewriter.getI64VectorAttr({1, 1}); + + // Create the linalg op that performs pooling. + Value poolInitTensor = + rewriter.create(loc, accTy.getShape(), accETy); + + Value filledInitTensor = + rewriter.create(loc, initialValue, poolInitTensor) + .result(); + + Value fakeWindowDims = + rewriter.create(loc, kernel, accETy); + + // Sum across the pooled region. + Value poolingOp = rewriter + .create( + loc, ArrayRef{accTy}, + ValueRange{paddedInput, fakeWindowDims}, + filledInitTensor, strideAttr, dilationAttr) + .getResult(0); + + // Normalize the summed value by the number of elements grouped in each + // pool. + auto poolingOpTy = poolingOp.getType().cast(); + auto affineMap = rewriter.getMultiDimIdentityMap(resultTy.getRank()); + + Value genericInitTensor = rewriter.create( + loc, resultTy.getShape(), resultETy); + + auto genericOp = rewriter.create( + loc, ArrayRef({resultTy}), ValueRange{poolingOp}, + ValueRange{genericInitTensor}, + ArrayRef({affineMap, affineMap}), + getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &b, Location loc, ValueRange args) { + auto zero = rewriter.create(loc, 0); + auto one = rewriter.create(loc, 1); + auto iH = rewriter.create( + loc, poolingOpTy.getDimSize(1) - 1); + auto iW = rewriter.create( + loc, poolingOpTy.getDimSize(2) - 1); + + // Compute the indices from either end. + auto y0 = rewriter.create(loc, 1); + auto x0 = rewriter.create(loc, 2); + auto y1 = rewriter.create(loc, iH, y0); + auto x1 = rewriter.create(loc, iW, x0); + + // Determines what the portion of valid input is covered by the + // kernel. + auto padFn = [&](Value v, Value x, int64_t pad) -> Value { + if (pad == 0) + return v; + + auto padVal = rewriter.create(loc, pad); + Value dx = rewriter.create(loc, x, padVal); + + Value cmp = rewriter.create( + loc, arith::CmpIPredicate::slt, dx, zero); + Value offset = rewriter.create(loc, cmp, dx, zero); + return rewriter.create(loc, v, offset)->getResult(0); + }; + + // Compute the vertical component of coverage. + auto kH0 = rewriter.create(loc, kernel[0]); + auto kH1 = padFn(kH0, y0, pad[2]); + auto kH2 = padFn(kH1, y1, pad[3]); + auto kHCmp = rewriter.create( + loc, arith::CmpIPredicate::slt, kH2, one); + auto kH3 = rewriter.create(loc, kHCmp, one, kH2); + + // compute the horizontal component of coverage. + auto kW0 = rewriter.create(loc, kernel[1]); + auto kW1 = padFn(kW0, x0, pad[4]); + auto kW2 = padFn(kW1, x1, pad[5]); + auto kWCmp = rewriter.create( + loc, arith::CmpIPredicate::slt, kW2, one); + auto kW3 = rewriter.create(loc, kWCmp, one, kW2); + + // Compute the total number of elements and normalize. + Value count = rewriter.create(loc, kH3, kW3); + auto countI = rewriter.create( + loc, rewriter.getI32Type(), count); + + // Divide by the number of summed values. 
For floats this is just + // a div however for quantized values input normalization had + // to be applied. + Value poolVal = args[0]; + if (accETy.isa()) { + auto countF = rewriter.create(loc, accETy, countI); + poolVal = rewriter.create(loc, poolVal, countF) + ->getResult(0); + } else { + + // If we have quantization information we need to apply an offset + // for the input zp value. + if (op.quantization_info()) { + auto quantizationInfo = op.quantization_info().getValue(); + auto inputZp = rewriter.create( + loc, quantizationInfo.input_zp()); + Value offset = + rewriter.create(loc, accETy, countI, inputZp); + poolVal = + rewriter.create(loc, accETy, poolVal, offset); + } + + // Compute the multiplier and shift values for the quantization + // normalization. Preferably we would want to compute more bits + // however 32-bits should be enough for compute. Honestly we + // should probably straight divide. + int64_t numerator = ((1 << 30) + 1); + int64_t shift = 30; + + Value numeratorVal = rewriter.create( + loc, rewriter.getI32IntegerAttr(numerator)); + Value multiplierVal = + rewriter + .create(loc, rewriter.getI32Type(), + numeratorVal, countI) + .getResult(); + Value shiftVal = rewriter.create( + loc, rewriter.getI8IntegerAttr(shift)); + + auto scaled = + rewriter + .create( + loc, rewriter.getI32Type(), poolVal, multiplierVal, + shiftVal, rewriter.getBoolAttr(false)) + .getResult(); + + // If we have quantization information we need to apply output + // zeropoint. + if (op.quantization_info()) { + auto quantizationInfo = op.quantization_info().getValue(); + auto outputZp = rewriter.create( + loc, quantizationInfo.output_zp()); + scaled = rewriter.create(loc, scaled, outputZp) + .getResult(); + } + + // Apply Clip. + int64_t outBitwidth = resultETy.getIntOrFloatBitWidth(); + + auto min = rewriter.create( + loc, APInt::getSignedMinValue(outBitwidth).getSExtValue(), + accETy); + auto max = rewriter.create( + loc, APInt::getSignedMaxValue(outBitwidth).getSExtValue(), + accETy); + auto clamp = clampHelper( + loc, scaled, min, max, arith::CmpIPredicate::slt, rewriter); + + poolVal = clamp; + // Convert type. + if (resultETy != clamp.getType()) { + poolVal = + rewriter.create(loc, resultETy, poolVal); + } + } + + rewriter.create(loc, poolVal); + }); + + rewriter.replaceOp(op, genericOp.getResult(0)); + return success(); + } +}; + +} // namespace + +void mlir::tosa::populateTosaToLinalgNamedConversionPatterns( + RewritePatternSet *patterns) { + patterns->add< + // clang-format off + ConvConverter, + DepthwiseConvConverter, + MatMulConverter, + MaxPool2dConverter, + AvgPool2dConverter, + FullyConnectedConverter>(patterns->getContext()); + // clang-format on +} diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp copy from mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp copy to mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp @@ -30,7 +30,7 @@ using namespace mlir; namespace { -struct TosaToLinalg : public TosaToLinalgBase { +struct TosaToLinalgNamed : public TosaToLinalgNamedBase { public: void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); - target.addIllegalDialect(); + tosa::TosaDialect, tensor::TensorDialect, + scf::SCFDialect>(); // Not every TOSA op can be legalized to linalg. 
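+  // The ops handled by this pass are registered as explicitly illegal below,
+  // so the conversion fails loudly if one of the named-op patterns does not
+  // fire; all other operations remain dynamically legal.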
- target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); FuncOp func = getFunction(); - mlir::tosa::populateTosaToLinalgConversionPatterns(&patterns); + mlir::tosa::populateTosaToLinalgNamedConversionPatterns(&patterns); if (failed(applyFullConversion(func, target, std::move(patterns)))) signalPassFailure(); } }; } // namespace -std::unique_ptr mlir::tosa::createTosaToLinalg() { - return std::make_unique(); -} - -void mlir::tosa::addTosaToLinalgPasses(OpPassManager &pm) { - pm.addNestedPass(createTosaMakeBroadcastablePass()); - pm.addNestedPass(createTosaToLinalg()); +std::unique_ptr mlir::tosa::createTosaToLinalgNamed() { + return std::make_unique(); } diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp @@ -26,6 +26,7 @@ #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/Passes.h" using namespace mlir; @@ -67,6 +68,9 @@ } void mlir::tosa::addTosaToLinalgPasses(OpPassManager &pm) { + pm.addNestedPass(createTosaMakeBroadcastablePass()); + pm.addNestedPass(createTosaToLinalgNamed()); + pm.addNestedPass(mlir::createCanonicalizerPass()); pm.addNestedPass(createTosaMakeBroadcastablePass()); pm.addNestedPass(createTosaToLinalg()); } diff --git a/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp b/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp --- a/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp +++ b/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp @@ -14,6 +14,8 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" +#include "llvm/ADT/APSInt.h" + using namespace mlir; using namespace mlir::arith; @@ -881,6 +883,18 @@ return checkIntFloatCast(inputs, outputs); } +OpFoldResult arith::UIToFPOp::fold(ArrayRef operands) { + if (auto lhs = operands[0].dyn_cast_or_null()) { + const APInt &api = lhs.getValue(); + FloatType floatTy = getType().cast(); + APFloat apf(floatTy.getFloatSemantics(), + APInt::getZero(floatTy.getWidth())); + apf.convertFromAPInt(api, /*signed=*/false, APFloat::rmNearestTiesToEven); + return FloatAttr::get(floatTy, apf); + } + return {}; +} + //===----------------------------------------------------------------------===// // SIToFPOp //===----------------------------------------------------------------------===// @@ -889,6 +903,17 @@ return checkIntFloatCast(inputs, outputs); } +OpFoldResult arith::SIToFPOp::fold(ArrayRef operands) { + if (auto lhs = operands[0].dyn_cast_or_null()) { + const APInt &api = lhs.getValue(); + FloatType floatTy = getType().cast(); + APFloat apf(floatTy.getFloatSemantics(), + APInt::getZero(floatTy.getWidth())); + apf.convertFromAPInt(api, /*signed=*/true, APFloat::rmNearestTiesToEven); + return FloatAttr::get(floatTy, apf); + } + return {}; +} //===----------------------------------------------------------------------===// // FPToUIOp //===----------------------------------------------------------------------===// @@ -897,6 +922,24 @@ return checkIntFloatCast(inputs, outputs); } +OpFoldResult arith::FPToUIOp::fold(ArrayRef operands) { + if (auto lhs = 
operands[0].dyn_cast_or_null()) { + const APFloat &apf = lhs.getValue(); + IntegerType intTy = getType().cast(); + bool ignored; + APSInt api(intTy.getWidth(), /*unsigned=*/true); + if (APFloat::opInvalidOp == + apf.convertToInteger(api, APFloat::rmTowardZero, &ignored)) { + // Undefined behavior invoked - the destination type can't represent + // the input constant. + return {}; + } + return IntegerAttr::get(getType(), api); + } + + return {}; +} + //===----------------------------------------------------------------------===// // FPToSIOp //===----------------------------------------------------------------------===// @@ -905,6 +948,24 @@ return checkIntFloatCast(inputs, outputs); } +OpFoldResult arith::FPToSIOp::fold(ArrayRef operands) { + if (auto lhs = operands[0].dyn_cast_or_null()) { + const APFloat &apf = lhs.getValue(); + IntegerType intTy = getType().cast(); + bool ignored; + APSInt api(intTy.getWidth(), /*unsigned=*/false); + if (APFloat::opInvalidOp == + apf.convertToInteger(api, APFloat::rmTowardZero, &ignored)) { + // Undefined behavior invoked - the destination type can't represent + // the input constant. + return {}; + } + return IntegerAttr::get(getType(), api); + } + + return {}; +} + //===----------------------------------------------------------------------===// // IndexCastOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -1495,28 +1495,14 @@ /// static representation of offsets, sizes and strides. Special sentinels /// encode the dynamic case. Type SubViewOp::inferResultType(MemRefType sourceMemRefType, - ArrayRef leadingStaticOffsets, - ArrayRef leadingStaticSizes, - ArrayRef leadingStaticStrides) { - // A subview may specify only a leading subset of offset/sizes/strides in - // which case we complete with offset=0, sizes from memref type and strides=1. + ArrayRef staticOffsets, + ArrayRef staticSizes, + ArrayRef staticStrides) { unsigned rank = sourceMemRefType.getRank(); - assert(leadingStaticOffsets.size() <= rank && - "unexpected leadingStaticOffsets overflow"); - assert(leadingStaticSizes.size() <= rank && - "unexpected leadingStaticSizes overflow"); - assert(leadingStaticStrides.size() <= rank && - "unexpected leadingStaticStrides overflow"); - auto staticOffsets = llvm::to_vector<4>(leadingStaticOffsets); - auto staticSizes = llvm::to_vector<4>(leadingStaticSizes); - auto staticStrides = llvm::to_vector<4>(leadingStaticStrides); - unsigned numTrailingOffsets = rank - staticOffsets.size(); - unsigned numTrailingSizes = rank - staticSizes.size(); - unsigned numTrailingStrides = rank - staticStrides.size(); - staticOffsets.append(numTrailingOffsets, 0); - llvm::append_range(staticSizes, - sourceMemRefType.getShape().take_back(numTrailingSizes)); - staticStrides.append(numTrailingStrides, 1); + (void)rank; + assert(staticOffsets.size() == rank && "unexpected staticOffsets overflow"); + assert(staticSizes.size() == rank && "unexpected staticSizes overflow"); + assert(staticStrides.size() == rank && "unexpected staticStrides overflow"); // Extract source offset and strides. 
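  // Note: the three arrays above must now be rank-complete (one entry per
  // source dimension); dynamic entries are encoded with the ShapedType
  // kDynamic* sentinels instead of being completed from a leading subset.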
int64_t sourceOffset; @@ -1553,29 +1539,28 @@ } Type SubViewOp::inferResultType(MemRefType sourceMemRefType, - ArrayRef leadingStaticOffsets, - ArrayRef leadingStaticSizes, - ArrayRef leadingStaticStrides) { + ArrayRef offsets, + ArrayRef sizes, + ArrayRef strides) { SmallVector staticOffsets, staticSizes, staticStrides; SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; - dispatchIndexOpFoldResults(leadingStaticOffsets, dynamicOffsets, - staticOffsets, ShapedType::kDynamicStrideOrOffset); - dispatchIndexOpFoldResults(leadingStaticSizes, dynamicSizes, staticSizes, + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets, + ShapedType::kDynamicStrideOrOffset); + dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes, ShapedType::kDynamicSize); - dispatchIndexOpFoldResults(leadingStaticStrides, dynamicStrides, - staticStrides, ShapedType::kDynamicStrideOrOffset); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides, + ShapedType::kDynamicStrideOrOffset); return SubViewOp::inferResultType(sourceMemRefType, staticOffsets, staticSizes, staticStrides); } -Type SubViewOp::inferRankReducedResultType( - unsigned resultRank, MemRefType sourceRankedTensorType, - ArrayRef leadingStaticOffsets, - ArrayRef leadingStaticSizes, - ArrayRef leadingStaticStrides) { +Type SubViewOp::inferRankReducedResultType(unsigned resultRank, + MemRefType sourceRankedTensorType, + ArrayRef offsets, + ArrayRef sizes, + ArrayRef strides) { auto inferredType = - inferResultType(sourceRankedTensorType, leadingStaticOffsets, - leadingStaticSizes, leadingStaticStrides) + inferResultType(sourceRankedTensorType, offsets, sizes, strides) .cast(); assert(inferredType.getRank() >= resultRank && "expected "); int rankDiff = inferredType.getRank() - resultRank; @@ -1598,19 +1583,19 @@ return inferredType; } -Type SubViewOp::inferRankReducedResultType( - unsigned resultRank, MemRefType sourceRankedTensorType, - ArrayRef leadingStaticOffsets, - ArrayRef leadingStaticSizes, - ArrayRef leadingStaticStrides) { +Type SubViewOp::inferRankReducedResultType(unsigned resultRank, + MemRefType sourceRankedTensorType, + ArrayRef offsets, + ArrayRef sizes, + ArrayRef strides) { SmallVector staticOffsets, staticSizes, staticStrides; SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; - dispatchIndexOpFoldResults(leadingStaticOffsets, dynamicOffsets, - staticOffsets, ShapedType::kDynamicStrideOrOffset); - dispatchIndexOpFoldResults(leadingStaticSizes, dynamicSizes, staticSizes, + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets, + ShapedType::kDynamicStrideOrOffset); + dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes, ShapedType::kDynamicSize); - dispatchIndexOpFoldResults(leadingStaticStrides, dynamicStrides, - staticStrides, ShapedType::kDynamicStrideOrOffset); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides, + ShapedType::kDynamicStrideOrOffset); return SubViewOp::inferRankReducedResultType( resultRank, sourceRankedTensorType, staticOffsets, staticSizes, staticStrides); @@ -1893,6 +1878,43 @@ mixedStrides); } +/// Helper method to check if a `subview` operation is trivially a no-op. This +/// is the case if the all offsets are zero, all strides are 1, and the source +/// shape is same as the size of the subview. In such cases, the subview can be +/// folded into its source. 
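+/// For example,
+///   memref.subview %src[0, 0] [5, 3] [1, 1]
+///       : memref<5x3xf32> to memref<5x3xf32>
+/// satisfies all three conditions and can be folded away to %src.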
+static bool isTrivialSubViewOp(SubViewOp subViewOp) { + if (subViewOp.getSourceType().getRank() != subViewOp.getType().getRank()) + return false; + + auto mixedOffsets = subViewOp.getMixedOffsets(); + auto mixedSizes = subViewOp.getMixedSizes(); + auto mixedStrides = subViewOp.getMixedStrides(); + + // Check offsets are zero. + if (llvm::any_of(mixedOffsets, [](OpFoldResult ofr) { + Optional intValue = getConstantIntValue(ofr); + return !intValue || intValue.getValue() != 0; + })) + return false; + + // Check strides are one. + if (llvm::any_of(mixedStrides, [](OpFoldResult ofr) { + Optional intValue = getConstantIntValue(ofr); + return !intValue || intValue.getValue() != 1; + })) + return false; + + // Check all size values are static and matches the (static) source shape. + ArrayRef sourceShape = subViewOp.getSourceType().getShape(); + for (auto size : llvm::enumerate(mixedSizes)) { + Optional intValue = getConstantIntValue(size.value()); + if (!intValue || intValue.getValue() != sourceShape[size.index()]) + return false; + } + // All conditions met. The `SubViewOp` is foldable as a no-op. + return true; +} + namespace { /// Pattern to rewrite a subview op with MemRefCast arguments. /// This essentially pushes memref.cast past its consuming subview when @@ -1950,6 +1972,26 @@ return success(); } }; + +/// Canonicalize subview ops that are no-ops. When the source shape is not same +/// as a result shape due to use of `affine_map`. +class TrivialSubViewOpFolder final : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(SubViewOp subViewOp, + PatternRewriter &rewriter) const override { + if (!isTrivialSubViewOp(subViewOp)) + return failure(); + if (subViewOp.getSourceType() == subViewOp.getType()) { + rewriter.replaceOp(subViewOp, subViewOp.source()); + return success(); + } + rewriter.replaceOpWithNewOp(subViewOp, subViewOp.source(), + subViewOp.getType()); + return success(); + } +}; } // namespace /// Return the canonical type of the result of a subview. @@ -1975,7 +2017,7 @@ results .add, - SubViewOpMemRefCastFolder>(context); + SubViewOpMemRefCastFolder, TrivialSubViewOpFolder>(context); } OpFoldResult SubViewOp::fold(ArrayRef operands) { diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -155,6 +155,8 @@ case AtomicRMWKind::mins: case AtomicRMWKind::minu: case AtomicRMWKind::muli: + case AtomicRMWKind::ori: + case AtomicRMWKind::andi: if (!op.getValue().getType().isa()) return op.emitOpError() << "with kind '" << stringifyAtomicRMWKind(op.getKind()) @@ -178,7 +180,12 @@ case AtomicRMWKind::addf: case AtomicRMWKind::addi: case AtomicRMWKind::maxu: + case AtomicRMWKind::ori: return builder.getZeroAttr(resultType); + case AtomicRMWKind::andi: + return builder.getIntegerAttr( + resultType, + APInt::getAllOnes(resultType.cast().getWidth())); case AtomicRMWKind::maxs: return builder.getIntegerAttr( resultType, @@ -240,6 +247,10 @@ return builder.create(loc, lhs, rhs); case AtomicRMWKind::minu: return builder.create(loc, lhs, rhs); + case AtomicRMWKind::ori: + return builder.create(loc, lhs, rhs); + case AtomicRMWKind::andi: + return builder.create(loc, lhs, rhs); // TODO: Add remaining reduction operations. 
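+  // The identity values registered in getIdentityValue above are chosen so
+  // the reduction's first combine is a no-op: `ori` seeds with 0
+  // (x | 0 == x) and `andi` with the all-ones value (x & ~0 == x).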
default: (void)emitOptionalError(loc, "Reduction operation type not supported"); diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -827,38 +827,31 @@ /// An extract_slice op result type can be fully inferred from the source type /// and the static representation of offsets, sizes and strides. Special /// sentinels encode the dynamic case. -RankedTensorType -ExtractSliceOp::inferResultType(RankedTensorType sourceRankedTensorType, - ArrayRef leadingStaticOffsets, - ArrayRef leadingStaticSizes, - ArrayRef leadingStaticStrides) { +RankedTensorType ExtractSliceOp::inferResultType( + RankedTensorType sourceRankedTensorType, ArrayRef staticOffsets, + ArrayRef staticSizes, ArrayRef staticStrides) { // An extract_slice op may specify only a leading subset of offset/sizes/ // strides in which case we complete with offset=0, sizes from memref type and // strides=1. unsigned rank = sourceRankedTensorType.getRank(); - assert(leadingStaticSizes.size() <= rank && - "unexpected leadingStaticSizes overflow"); - auto staticSizes = llvm::to_vector<4>(leadingStaticSizes); - unsigned numTrailingSizes = rank - staticSizes.size(); - llvm::append_range(staticSizes, sourceRankedTensorType.getShape().take_back( - numTrailingSizes)); + (void)rank; + assert(staticSizes.size() == rank && + "unexpected staticSizes not equal to rank of source"); return RankedTensorType::get(staticSizes, sourceRankedTensorType.getElementType()); } -RankedTensorType -ExtractSliceOp::inferResultType(RankedTensorType sourceRankedTensorType, - ArrayRef leadingStaticOffsets, - ArrayRef leadingStaticSizes, - ArrayRef leadingStaticStrides) { +RankedTensorType ExtractSliceOp::inferResultType( + RankedTensorType sourceRankedTensorType, ArrayRef offsets, + ArrayRef sizes, ArrayRef strides) { SmallVector staticOffsets, staticSizes, staticStrides; SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; - dispatchIndexOpFoldResults(leadingStaticOffsets, dynamicOffsets, - staticOffsets, ShapedType::kDynamicStrideOrOffset); - dispatchIndexOpFoldResults(leadingStaticSizes, dynamicSizes, staticSizes, + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets, + ShapedType::kDynamicStrideOrOffset); + dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes, ShapedType::kDynamicSize); - dispatchIndexOpFoldResults(leadingStaticStrides, dynamicStrides, - staticStrides, ShapedType::kDynamicStrideOrOffset); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides, + ShapedType::kDynamicStrideOrOffset); return ExtractSliceOp::inferResultType(sourceRankedTensorType, staticOffsets, staticSizes, staticStrides); } @@ -868,12 +861,10 @@ /// sentinels encode the dynamic case. 
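/// For example, with resultRank = 1, a slice of sizes [1, 6] taken from a
/// tensor<8x6xf32> infers tensor<6xf32>: unit dimensions are dropped until
/// the inferred rank matches the requested one.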
RankedTensorType ExtractSliceOp::inferRankReducedResultType( unsigned resultRank, RankedTensorType sourceRankedTensorType, - ArrayRef leadingStaticOffsets, - ArrayRef leadingStaticSizes, - ArrayRef leadingStaticStrides) { + ArrayRef offsets, ArrayRef sizes, + ArrayRef strides) { auto inferredType = - inferResultType(sourceRankedTensorType, leadingStaticOffsets, - leadingStaticSizes, leadingStaticStrides) + inferResultType(sourceRankedTensorType, offsets, sizes, strides) .cast(); int rankDiff = inferredType.getRank() - resultRank; if (rankDiff > 0) { @@ -892,17 +883,16 @@ RankedTensorType ExtractSliceOp::inferRankReducedResultType( unsigned resultRank, RankedTensorType sourceRankedTensorType, - ArrayRef leadingStaticOffsets, - ArrayRef leadingStaticSizes, - ArrayRef leadingStaticStrides) { + ArrayRef offsets, ArrayRef sizes, + ArrayRef strides) { SmallVector staticOffsets, staticSizes, staticStrides; SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; - dispatchIndexOpFoldResults(leadingStaticOffsets, dynamicOffsets, - staticOffsets, ShapedType::kDynamicStrideOrOffset); - dispatchIndexOpFoldResults(leadingStaticSizes, dynamicSizes, staticSizes, + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets, + ShapedType::kDynamicStrideOrOffset); + dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes, ShapedType::kDynamicSize); - dispatchIndexOpFoldResults(leadingStaticStrides, dynamicStrides, - staticStrides, ShapedType::kDynamicStrideOrOffset); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides, + ShapedType::kDynamicStrideOrOffset); return ExtractSliceOp::inferRankReducedResultType( resultRank, sourceRankedTensorType, staticOffsets, staticSizes, staticStrides); @@ -919,12 +909,10 @@ SmallVector staticOffsets, staticSizes, staticStrides; SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets, - ShapedType::kDynamicStrideOrOffset); dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes, ShapedType::kDynamicSize); dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides, - ShapedType::kDynamicStrideOrOffset); auto sourceRankedTensorType = source.getType().cast(); // Structuring implementation this way avoids duplication between builders. @@ -1225,12 +1213,10 @@ SmallVector staticOffsets, staticSizes, staticStrides; SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets, - ShapedType::kDynamicStrideOrOffset); dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes, ShapedType::kDynamicSize); dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides, - ShapedType::kDynamicStrideOrOffset); build(b, result, dest.getType(), source, dest, dynamicOffsets, dynamicSizes, dynamicStrides, b.getI64ArrayAttr(staticOffsets), diff --git a/mlir/lib/Dialect/Vector/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransferOpTransforms.cpp --- a/mlir/lib/Dialect/Vector/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/VectorTransferOpTransforms.cpp @@ -212,10 +212,11 @@ } /// Drops unit dimensions from the input MemRefType. 
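/// The caller now passes the full (offsets, sizes, strides) description of
/// the view, since SubViewOp::inferRankReducedResultType no longer completes
/// a partial leading specification on its behalf.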
-static MemRefType dropUnitDims(MemRefType inputType) { - ArrayRef none{}; +static MemRefType dropUnitDims(MemRefType inputType, ArrayRef offsets, + ArrayRef sizes, + ArrayRef strides) { Type rankReducedType = memref::SubViewOp::inferRankReducedResultType( - 0, inputType, none, none, none); + 0, inputType, offsets, sizes, strides); return canonicalizeStridedLayout(rankReducedType.cast()); } @@ -226,15 +227,16 @@ Value input) { MemRefType inputType = input.getType().cast(); assert(inputType.hasStaticShape()); - MemRefType resultType = dropUnitDims(inputType); + SmallVector subViewOffsets(inputType.getRank(), 0); + SmallVector subViewStrides(inputType.getRank(), 1); + ArrayRef subViewSizes = inputType.getShape(); + MemRefType resultType = + dropUnitDims(inputType, subViewOffsets, subViewSizes, subViewStrides); if (canonicalizeStridedLayout(resultType) == canonicalizeStridedLayout(inputType)) return input; - SmallVector subviewOffsets(inputType.getRank(), 0); - SmallVector subviewStrides(inputType.getRank(), 1); return rewriter.create( - loc, resultType, input, subviewOffsets, inputType.getShape(), - subviewStrides); + loc, resultType, input, subViewOffsets, subViewSizes, subViewStrides); } /// Returns the number of dims that aren't unit dims. diff --git a/mlir/lib/Interfaces/ViewLikeInterface.cpp b/mlir/lib/Interfaces/ViewLikeInterface.cpp --- a/mlir/lib/Interfaces/ViewLikeInterface.cpp +++ b/mlir/lib/Interfaces/ViewLikeInterface.cpp @@ -18,12 +18,12 @@ #include "mlir/Interfaces/ViewLikeInterface.cpp.inc" LogicalResult mlir::verifyListOfOperandsOrIntegers( - Operation *op, StringRef name, unsigned maxNumElements, ArrayAttr attr, + Operation *op, StringRef name, unsigned numElements, ArrayAttr attr, ValueRange values, llvm::function_ref isDynamic) { /// Check static and dynamic offsets/sizes/strides does not overflow type. - if (attr.size() > maxNumElements) - return op->emitError("expected <= ") - << maxNumElements << " " << name << " values"; + if (attr.size() != numElements) + return op->emitError("expected ") + << numElements << " " << name << " values"; unsigned expectedNumDynamicEntries = llvm::count_if(attr.getValue(), [&](Attribute attr) { return isDynamic(attr.cast().getInt()); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp --- a/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp @@ -49,17 +49,18 @@ /// Create a constant string location from the MLIR Location information. static llvm::Constant *createSourceLocStrFromLocation(Location loc, OpenACCIRBuilder &builder, - StringRef name) { + StringRef name, + uint32_t &strLen) { if (auto fileLoc = loc.dyn_cast()) { StringRef fileName = fileLoc.getFilename(); unsigned lineNo = fileLoc.getLine(); unsigned colNo = fileLoc.getColumn(); - return builder.getOrCreateSrcLocStr(name, fileName, lineNo, colNo); + return builder.getOrCreateSrcLocStr(name, fileName, lineNo, colNo, strLen); } std::string locStr; llvm::raw_string_ostream locOS(locStr); locOS << loc; - return builder.getOrCreateSrcLocStr(locOS.str()); + return builder.getOrCreateSrcLocStr(locOS.str(), strLen); } /// Create the location struct from the operation location information. @@ -68,20 +69,23 @@ auto loc = op->getLoc(); auto funcOp = op->getParentOfType(); StringRef funcName = funcOp ? 
funcOp.getName() : "unknown"; + uint32_t strLen; llvm::Constant *locStr = - createSourceLocStrFromLocation(loc, builder, funcName); - return builder.getOrCreateIdent(locStr); + createSourceLocStrFromLocation(loc, builder, funcName, strLen); + return builder.getOrCreateIdent(locStr, strLen); } /// Create a constant string representing the mapping information extracted from /// the MLIR location information. static llvm::Constant *createMappingInformation(Location loc, OpenACCIRBuilder &builder) { + uint32_t strLen; if (auto nameLoc = loc.dyn_cast()) { StringRef name = nameLoc.getName(); - return createSourceLocStrFromLocation(nameLoc.getChildLoc(), builder, name); + return createSourceLocStrFromLocation(nameLoc.getChildLoc(), builder, name, + strLen); } - return createSourceLocStrFromLocation(loc, builder, "unknown"); + return createSourceLocStrFromLocation(loc, builder, "unknown", strLen); } /// Return the runtime function used to lower the given operation. diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir --- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir @@ -448,7 +448,7 @@ // CHECK: %[[C3_3:.*]] = llvm.mlir.constant(3 : i64) : i64 // CHECK: llvm.insertvalue %[[C3_2]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: llvm.insertvalue %[[C3_3]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = memref.subview %0[2][3][1]: memref<5x3xf32> to memref<3x3xf32, offset: 6, strides: [3, 1]> + %2 = memref.subview %0[2, 0][3, 3][1, 1]: memref<5x3xf32> to memref<3x3xf32, offset: 6, strides: [3, 1]> return } @@ -466,13 +466,15 @@ // CHECK: %[[ST0:.*]] = llvm.extractvalue %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[ST1:.*]] = llvm.extractvalue %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // Compute and insert offset from 2 + dynamic value. - // CHECK: %[[OFF:.*]] = llvm.extractvalue %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[OFF0:.*]] = llvm.extractvalue %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64 - // CHECK: %[[MUL:.*]] = llvm.mul %[[C2]], %[[ST0]] : i64 - // CHECK: %[[NEW_OFF:.*]] = llvm.add %[[OFF]], %[[MUL]] : i64 + // CHECK: %[[MUL0:.*]] = llvm.mul %[[C2]], %[[ST0]] : i64 + // CHECK: %[[OFF1:.*]] = llvm.add %[[OFF0]], %[[MUL0]] : i64 + // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK: %[[MUL1:.*]] = llvm.mul %[[C0]], %[[ST1]] : i64 + // CHECK: %[[NEW_OFF:.*]] = llvm.add %[[OFF1]], %[[MUL1]] : i64 // CHECK: llvm.insertvalue %[[NEW_OFF]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // Sizes and strides @rank 1: static stride 1, dynamic size unchanged from source memref. 
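  // Note: subview operands must now be rank-complete, so the op below spells
  // out %0[2, 0][3, %d0][1, 1] rather than the old leading-subset
  // %0[2][3][1] form.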
- // CHECK: %[[SZ1:.*]] = llvm.extractvalue %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: llvm.insertvalue %{{.*}}, %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: llvm.insertvalue %[[C1]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> @@ -482,7 +484,9 @@ // CHECK: %[[MUL:.*]] = llvm.mul %[[C1_2]], %[[ST0]] : i64 // CHECK: llvm.insertvalue %[[C3]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: llvm.insertvalue %[[MUL]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = memref.subview %0[2][3][1]: memref<5x?xf32> to memref<3x?xf32, offset: ?, strides: [?, 1]> + %c0 = arith.constant 1 : index + %d0 = memref.dim %0, %c0 : memref<5x?xf32> + %1 = memref.subview %0[2, 0][3, %d0][1, 1]: memref<5x?xf32> to memref<3x?xf32, offset: ?, strides: [?, 1]> return } @@ -506,7 +510,7 @@ // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: llvm.insertvalue %[[C3]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = memref.subview %0[1][1][1]: memref<5x3xf32> to memref<3xf32, offset: 3, strides: [1]> + %1 = memref.subview %0[1, 0][1, 3][1, 1]: memref<5x3xf32> to memref<3xf32, offset: 3, strides: [1]> return } diff --git a/mlir/test/Conversion/StandardToLLVM/standard-to-llvm.mlir b/mlir/test/Conversion/StandardToLLVM/standard-to-llvm.mlir --- a/mlir/test/Conversion/StandardToLLVM/standard-to-llvm.mlir +++ b/mlir/test/Conversion/StandardToLLVM/standard-to-llvm.mlir @@ -502,6 +502,10 @@ // CHECK: llvm.atomicrmw umin %{{.*}}, %{{.*}} acq_rel atomic_rmw addf %fval, %F[%i] : (f32, memref<10xf32>) -> f32 // CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} acq_rel + atomic_rmw ori %ival, %I[%i] : (i32, memref<10xi32>) -> i32 + // CHECK: llvm.atomicrmw _or %{{.*}}, %{{.*}} acq_rel + atomic_rmw andi %ival, %I[%i] : (i32, memref<10xi32>) -> i32 + // CHECK: llvm.atomicrmw _and %{{.*}}, %{{.*}} acq_rel return } diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -0,0 +1,448 @@ +// RUN: mlir-opt --split-input-file --tosa-to-linalg-named %s -verify-diagnostics -o -| FileCheck %s + +// CHECK-LABEL: @matmul +func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) { + // CHECK: [[C0:%.+]] = arith.constant 0 + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 6] + // CHECK: [[FILLED:%.+]] = linalg.fill([[C0]], [[INIT]]) : f32, tensor<1x5x6xf32> -> tensor<1x5x6xf32> + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x6xf32>) outs([[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> + %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) + return %0 : tensor<1x5x6xf32> +} + +// ----- + + +// CHECK-LABEL: @matmul_quantized +func @matmul_quantized(%arg0: tensor<1x5x3xi8>, %arg1: tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) { + // CHECK: [[C0:%.+]] = arith.constant 0 + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 6] + // CHECK: [[FILLED:%.+]] = linalg.fill([[C0]], [[INIT]]) : i32, tensor<1x5x6xi32> -> tensor<1x5x6xi32> + // CHECK: [[ONE:%.+]] = 
arith.constant 1 + // CHECK: [[TWO:%.+]] = arith.constant 2 + // CHECK: linalg.quantized_batch_matmul ins(%arg0, %arg1, [[ONE]], [[TWO]] : tensor<1x5x3xi8>, tensor<1x3x6xi8>, i32, i32) outs([[FILLED]] : tensor<1x5x6xi32>) -> tensor<1x5x6xi32> + %0 = "tosa.matmul"(%arg0, %arg1) {quantization_info = {a_zp = 1 : i32, b_zp = 2 : i32}} : (tensor<1x5x3xi8>, tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) + return %0 : tensor<1x5x6xi32> +} + +// ----- + +// CHECK-LABEL: @matmul_dyn_batch +func @matmul_dyn_batch(%arg0: tensor<?x5x3xf32>, %arg1: tensor<?x3x6xf32>) -> (tensor<?x5x6xf32>) { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]] + // CHECK: %[[C0_0:.+]] = arith.constant 0 + // CHECK: %[[INIT:.+]] = linalg.init_tensor [%[[DIM]], 5, 6] + // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0_0]], %[[INIT]]) : f32, tensor<?x5x6xf32> -> tensor<?x5x6xf32> + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<?x5x3xf32>, tensor<?x3x6xf32>) outs(%[[FILLED]] : tensor<?x5x6xf32>) -> tensor<?x5x6xf32> + %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<?x5x3xf32>, tensor<?x3x6xf32>) -> (tensor<?x5x6xf32>) + return %0 : tensor<?x5x6xf32> +} + +// ----- + +// CHECK-LABEL: @matmul_dyn_independent_dim +func @matmul_dyn_independent_dim(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) { + // CHECK: %[[C2:.+]] = arith.constant 2 + // CHECK: %[[DIM:.+]] = tensor.dim %arg1, %[[C2]] + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 5, %[[DIM]]] + // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0]], %[[INIT]]) : f32, tensor<1x5x?xf32> -> tensor<1x5x?xf32> + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x?xf32>) outs(%[[FILLED]] : tensor<1x5x?xf32>) -> tensor<1x5x?xf32> + %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) + return %0 : tensor<1x5x?xf32> +} + +// ----- + +// CHECK-LABEL: @matmul_dyn_independent_dim +func @matmul_dyn_independent_dim(%arg0: tensor<1x5x?xf32>, %arg1: tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 5, 6] + // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0]], %[[INIT]]) : f32, tensor<1x5x6xf32> -> tensor<1x5x6xf32> + // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x?xf32>, tensor<1x?x6xf32>) outs(%[[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> + %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x?xf32>, tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) + return %0 : tensor<1x5x6xf32> +} + +// ----- + +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)> +// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)> + +// CHECK-LABEL: @fully_connected +func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<5x6xf32>) { + // CHECK: [[INITT:%.+]] = linalg.init_tensor [5, 6] + // CHECK: [[ZERO:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[ZERO]], [[INITT]]) + // CHECK: [[PERM:%.+]] = arith.constant dense<[1, 0]> + // CHECK: [[TRANSPOSE:%.+]] = "tosa.transpose"(%arg1, [[PERM]]) + // CHECK: [[INITB:%.+]] = linalg.init_tensor [5, 6] + // CHECK: [[MATMUL:%.+]] = linalg.matmul ins(%arg0, [[TRANSPOSE]] : tensor<5x3xf32>, tensor<3x6xf32>) outs([[FILL]] : tensor<5x6xf32>) -> tensor<5x6xf32> + // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xf32>, tensor<5x6xf32>) outs([[INITB]] : tensor<5x6xf32>) { + // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): + // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 :
f32 + // CHECK: linalg.yield [[ADD]] : f32 + + %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) : (tensor<5x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> (tensor<5x6xf32>) + return %0 : tensor<5x6xf32> +} + +// ----- + +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)> +// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)> + +// CHECK-LABEL: @quantized_fully_connected +func @quantized_fully_connected(%arg0: tensor<5x3xi8>, %arg1: tensor<6x3xi8>, %arg2: tensor<6xi32>) -> (tensor<5x6xi32>) { + // CHECK: [[INITT:%.+]] = linalg.init_tensor [5, 6] + // CHECK: [[ZERO:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[ZERO]], [[INITT]]) + // CHECK: [[PERM:%.+]] = arith.constant dense<[1, 0]> + // CHECK: [[TRANSPOSE:%.+]] = "tosa.transpose"(%arg1, [[PERM]]) + // CHECK: [[INITB:%.+]] = linalg.init_tensor [5, 6] + // CHECK: [[ONE:%.+]] = arith.constant 1 + // CHECK: [[TWO:%.+]] = arith.constant 2 + // CHECK: [[MATMUL:%.+]] = linalg.quantized_matmul ins(%arg0, [[TRANSPOSE]], [[ONE]], [[TWO]] : tensor<5x3xi8>, tensor<3x6xi8>, i32, i32) outs([[FILL]] : tensor<5x6xi32>) -> tensor<5x6xi32> + // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xi32>, tensor<5x6xi32>) outs([[INITB]] + // CHECK: ^bb0([[IN1:%.+]]: i32, [[IN2:%.+]]: i32, [[UNUSED:%.+]]: i32): + // CHECK: [[ADD:%.+]] = arith.addi + // CHECK: linalg.yield [[ADD]] : i32 + %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) {quantization_info = {input_zp = 1:i32, weight_zp = 2:i32}} : (tensor<5x3xi8>, tensor<6x3xi8>, tensor<6xi32>) -> (tensor<5x6xi32>) + return %0 : tensor<5x6xi32> +} + +// ----- + +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)> +// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)> + +// CHECK-LABEL: @fully_connected_dyn +func @fully_connected_dyn(%arg0: tensor<?x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<?x6xf32>) { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]] + // CHECK: %[[INITT:.+]] = linalg.init_tensor [%[[DIM]], 6] + // CHECK: %[[ZERO:.+]] = arith.constant 0 + // CHECK: %[[FILL:.+]] = linalg.fill(%[[ZERO]], %[[INITT]]) + // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> + // CHECK: %[[TRANSPOSE:.+]] = "tosa.transpose"(%arg1, %[[PERM]]) + // CHECK: %[[INITB:.+]] = linalg.init_tensor [%[[DIM]], 6] + // CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%arg0, %[[TRANSPOSE]] : tensor<?x3xf32>, tensor<3x6xf32>) outs(%[[FILL]] : tensor<?x6xf32>) -> tensor<?x6xf32> + // CHECK: %[[ADDED:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, %[[MATMUL]] : tensor<6xf32>, tensor<?x6xf32>) outs(%[[INITB]] : tensor<?x6xf32>) { + // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): + // CHECK: %[[ADD:.+]] = arith.addf %arg3, %arg4 : f32 + // CHECK: linalg.yield %[[ADD]] : f32 + + %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) : (tensor<?x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> (tensor<?x6xf32>) + return %0 : tensor<?x6xf32> +} + +// ----- + +// CHECK-LABEL: @max_pool +func @max_pool(%arg0: tensor<1x6x34x62xf32>) -> () { + // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 + // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 32, 62] + // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[CONST]], [[INIT]]) + // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3] + // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, [[KERNEL]] : tensor<1x6x34x62xf32>,
tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x32x62xf32>) + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x32x62xf32>) + return +} + +// CHECK-LABEL: @max_pool_padded +func @max_pool_padded(%arg0: tensor<1x6x34x62xf32>) -> () { + // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 : f32 + // CHECK-DAG: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 0, 0, 0] high[0, 0, 1, 0] + // CHECK-DAG: linalg.yield [[CONST]] + // CHECK-DAG: [[INITVAL:%.+]] = arith.constant -3.40282347E+38 : f32 + // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 33, 62] + // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INITVAL]], [[INIT]]) + // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3] + // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x6x35x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x33x62xf32>) + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 1], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x33x62xf32>) + return +} + +// CHECK-LABEL: @max_pool_i8 +func @max_pool_i8(%arg0: tensor<1x6x34x62xi8>) -> () { + // CHECK: arith.constant -128 + // CHECK: linalg.pooling_nhwc_max + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi8>) -> (tensor<1x4x32x62xi8>) + return +} + +// CHECK-LABEL: @max_pool_i16 +func @max_pool_i16(%arg0: tensor<1x6x34x62xi16>) -> () { + // CHECK: arith.constant -32768 + // CHECK: linalg.pooling_nhwc_max + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi16>) -> (tensor<1x4x32x62xi16>) + return +} + +// CHECK-LABEL: @max_pool_i32 +func @max_pool_i32(%arg0: tensor<1x6x34x62xi32>) -> () { + // CHECK: arith.constant -2147483648 + // CHECK: linalg.pooling_nhwc_max + %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi32>) -> (tensor<1x4x32x62xi32>) + return +} +// ----- + +// CHECK-LABEL: @avg_pool +func @avg_pool(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) { + // Initial piece computes the sum of the pooling region, with appropriate padding. + // CHECK: [[CONST:%.+]] = arith.constant 0 + // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: [[CONST:%.+]] = arith.constant 0 + // CHECK: [[POOLINIT:%.+]] = linalg.init_tensor [1, 5, 33, 62] + // CHECK: [[FILL:%.+]] = linalg.fill([[CONST]], [[POOLINIT]]) + // CHECK: [[KERNEL:%.+]] = linalg.init_tensor [4, 4] + // CHECK: [[POOL:%.+]] = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x8x36x62xf32>, tensor<4x4xf32>) outs([[FILL]] : tensor<1x5x33x62xf32>) + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 33, 62] + // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[POOL]] : tensor<1x5x33x62xf32>) outs([[INIT]] : tensor<1x5x33x62xf32>) + // CHECK: [[ZERO:%.0]] = arith.constant 0 + // CHECK: [[ONE:%.+]] = arith.constant 1 + // CHECK: [[HEIGHT:%.+]] = arith.constant 4 + // CHECK: [[WIDTH:%.+]] = arith.constant 32 + // CHECK: [[IDX1:%.+]] = linalg.index 1 + // CHECK: [[IDX2:%.+]] = linalg.index 2 + + // The large block below computes what portion of the kernel is within non-padded input. 
+ // CHECK: [[NY:%.+]] = arith.subi [[HEIGHT]], [[IDX1]] + // CHECK: [[NX:%.+]] = arith.subi [[WIDTH]], [[IDX2]] + // CHECK: [[KH:%.+]] = arith.constant 4 + // CHECK: [[PAD0:%.+]] = arith.constant 1 + // CHECK: [[SUBP0:%.+]] = arith.subi [[IDX1]], [[PAD0]] + // CHECK: [[P0CMP:%.+]] = arith.cmpi slt, [[SUBP0]], [[ZERO]] + // CHECK: [[SELP0:%.+]] = select [[P0CMP]], [[SUBP0]], [[ZERO]] + // CHECK: [[ADDP0:%.+]] = arith.addi [[KH]], [[SELP0]] + // CHECK: [[PAD1:%.+]] = arith.constant 1 + // CHECK: [[SUBP1:%.+]] = arith.subi [[NY]], [[PAD1]] + // CHECK: [[P1CMP:%.+]] = arith.cmpi slt, [[SUBP1]], [[ZERO]] + // CHECK: [[SELP1:%.+]] = select [[P1CMP]], [[SUBP1]], [[ZERO]] + // CHECK: [[ADDP1:%.+]] = arith.addi [[ADDP0]], [[SELP1]] + // CHECK: [[YCMP:%.+]] = arith.cmpi slt, [[ADDP1]], [[ONE]] + // CHECK: [[YSEL:%.+]] = select [[YCMP]], [[ONE]], [[ADDP1]] + // CHECK: [[KW:%.+]] = arith.constant 4 : index + // CHECK: [[PAD2:%.+]] = arith.constant 1 : index + // CHECK: [[SUBP2:%.+]] = arith.subi [[IDX2]], [[PAD2]] + // CHECK: [[P2CMP:%.+]] = arith.cmpi slt, [[SUBP2]], [[ZERO]] + // CHECK: [[SELP2:%.+]] = select [[P2CMP]], [[SUBP2]], [[ZERO]] + // CHECK: [[ADDP2:%.+]] = arith.addi [[KW]], [[SELP2]] + // CHECK: [[PAD3:%.+]] = arith.constant 1 : index + // CHECK: [[SUBP3:%.+]] = arith.subi [[NX]], [[PAD3]] + // CHECK: [[P3CMP:%.+]] = arith.cmpi slt, [[SUBP3]], [[ZERO]] + // CHECK: [[SELP3:%.+]] = select [[P3CMP]], [[SUBP3]], [[ZERO]] + // CHECK: [[ADDP3:%.+]] = arith.addi [[ADDP2]], [[SELP3]] + // CHECK: [[XCMP:%.+]] = arith.cmpi slt, [[ADDP3]], [[ONE]] + // CHECK: [[XSEL:%.+]] = select [[XCMP]], [[ONE]], [[ADDP3]] + + // Given the valid coverage of the pooling region, normalize the summation. + // CHECK: [[C:%.+]] = arith.muli [[YSEL]], [[XSEL]] + // CHECK: [[CI:%.+]] = arith.index_cast [[C]] + // CHECK: [[CF:%.+]] = arith.sitofp [[CI]] + // CHECK: [[RESULT:%.+]] = arith.divf %arg1, [[CF]] + // CHECK: linalg.yield [[RESULT]] + %0 = "tosa.avg_pool2d"(%arg0) {pad = [1, 1, 1, 1], kernel = [4, 4], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) + return %0 : tensor<1x5x33x62xf32> +} + +// ----- + +// CHECK-LABEL: @avg_pool_i8 +func @avg_pool_i8(%arg0 : tensor<1x128x128x2xi8>) -> () { + + // CHECK: linalg.pooling_nhwc_sum + // CHECK: linalg.generic + + // CHECK: %[[INZP:.+]] = arith.constant -128 + // CHECK: %[[INZP_OFF:.+]] = arith.muli %{{.+}}, %[[INZP]] + // CHECK: %[[OFFSETED:.+]] = arith.subi %arg1, %[[INZP_OFF]] + // CHECK: %[[NUMERATOR:.+]] = arith.constant 1073741825 + // CHECK: %[[MULTIPLIER:.+]] = arith.divui %[[NUMERATOR]], %{{.+}} + // CHECK: %[[SHIFT:.+]] = arith.constant 30 + // CHECK: %[[SCALE:.+]] = "tosa.apply_scale"(%{{.+}}, %[[MULTIPLIER]], %[[SHIFT]]) {double_round = false} + // CHECK: %[[OUTZP:.+]] = arith.constant -128 + // CHECK: %[[OUT:.+]] = arith.addi %[[SCALE]], %[[OUTZP]] + // CHECK: %[[MIN:.+]] = arith.constant -128 + // CHECK: %[[MAX:.+]] = arith.constant 127 + // CHECK: %[[CMP_MIN:.+]] = arith.cmpi slt, %[[OUT]], %[[MIN]] + // CHECK: %[[CLMP_MIN:.+]] = select %[[CMP_MIN]], %[[MIN]], %[[OUT]] + // CHECK: %[[CMP_MAX:.+]] = arith.cmpi slt, %[[MAX]], %[[OUT]] + // CHECK: %[[CLMP_MAX:.+]] = select %[[CMP_MAX]], %[[MAX]], %[[CLMP_MIN]] + // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLMP_MAX]] + // CHECK: linalg.yield %[[TRUNC]] + %0 = "tosa.avg_pool2d"(%arg0) {kernel = [4, 4], pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, output_zp = -128 : i32}, stride = [4, 4]} : (tensor<1x128x128x2xi8>) -> tensor<1x32x32x2xi8> + return +} + +// ----- + +// 
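In one place, the divisor the avg_pool tests above check for: each output element of tosa.avg_pool2d is the window sum divided by the number of kernel taps that landed on real (non-padded) input, never fewer than one. A plain C++ restatement of the indexing in the CHECK block (our sketch; all names are ours, not the pass's):

  #include <algorithm>
  #include <cstdint>

  // Per-axis tap count: the kernel size, reduced by however much the window
  // hangs past the low edge (idx - padLo < 0) or the high edge
  // ((lastIdx - idx) - padHi < 0), then clamped to at least 1.
  int64_t validTaps(int64_t idx, int64_t lastIdx, int64_t kernel,
                    int64_t padLo, int64_t padHi) {
    int64_t taps = kernel;
    taps += std::min<int64_t>(idx - padLo, 0);
    taps += std::min<int64_t>((lastIdx - idx) - padHi, 0);
    return std::max<int64_t>(taps, 1);
  }

  // count = validTaps(y, outH - 1, kh, padTop, padBottom)
  //       * validTaps(x, outW - 1, kw, padLeft, padRight);
  // f32 path: result = sum / count (the sitofp + divf pair above); the i8
  // path divides by count through the (1 << 30) + 1 reciprocal multiplier
  // fed to tosa.apply_scale, then re-applies the output zero point and clamps.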
CHECK-LABEL: @avg_pool_i16 +func @avg_pool_i16(%arg0 : tensor<1x128x128x2xi16>) -> () { + + // CHECK: linalg.pooling_nhwc_sum + // CHECK: linalg.generic + + // CHECK: %[[INZP:.+]] = arith.constant -128 + // CHECK: %[[INZP_OFF:.+]] = arith.muli %{{.+}}, %[[INZP]] + // CHECK: %[[OFFSETED:.+]] = arith.subi %arg1, %[[INZP_OFF]] + // CHECK: %[[NUMERATOR:.+]] = arith.constant 1073741825 + // CHECK: %[[MULTIPLIER:.+]] = arith.divui %[[NUMERATOR]], %{{.+}} + // CHECK: %[[SHIFT:.+]] = arith.constant 30 + // CHECK: %[[SCALE:.+]] = "tosa.apply_scale"(%{{.+}}, %[[MULTIPLIER]], %[[SHIFT]]) {double_round = false} + // CHECK: %[[OUTZP:.+]] = arith.constant -128 + // CHECK: %[[OUT:.+]] = arith.addi %[[SCALE]], %[[OUTZP]] + // CHECK: %[[MIN:.+]] = arith.constant -32768 + // CHECK: %[[MAX:.+]] = arith.constant 32767 + // CHECK: %[[CMP_MIN:.+]] = arith.cmpi slt, %[[OUT]], %[[MIN]] + // CHECK: %[[CLMP_MIN:.+]] = select %[[CMP_MIN]], %[[MIN]], %[[OUT]] + // CHECK: %[[CMP_MAX:.+]] = arith.cmpi slt, %[[MAX]], %[[OUT]] + // CHECK: %[[CLMP_MAX:.+]] = select %[[CMP_MAX]], %[[MAX]], %[[CLMP_MIN]] + // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLMP_MAX]] + // CHECK: linalg.yield %[[TRUNC]] + %0 = "tosa.avg_pool2d"(%arg0) {kernel = [4, 4], pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, output_zp = -128 : i32}, stride = [4, 4]} : (tensor<1x128x128x2xi16>) -> tensor<1x32x32x2xi16> + return +} + +// ----- + +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @conv2d_f32 +func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () { + // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 2, 3, 0]> + // CHECK: %[[W:.+]] = "tosa.transpose"(%arg1, %[[PERM]]) + // CHECK: %[[M_IN:.+]] = linalg.init_tensor [1, 45, 40, 28] + // CHECK: %[[CST:.+]] = arith.constant 0 + // CHECK: %[[FILL:.+]] = linalg.fill + // CHECK: %[[B_IN:.+]] = linalg.init_tensor [1, 45, 40, 28] + // CHECK: %[[CONV:.+]] = linalg.conv_2d_nhwc_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[W]] : tensor<1x49x42x27xf32>, tensor<3x3x27x28xf32>) outs(%[[FILL]] : tensor<1x45x40x28xf32>) + // CHECK: %[[B:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %[[CONV]] : tensor<28xf32>, tensor<1x45x40x28xf32>) outs(%[[B_IN]] : tensor<1x45x40x28xf32>) + // CHECK: arith.addf + // CHECK: linalg.yield + %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [0, 0, 0, 0], stride = [1, 1], dilation = [2, 1]} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) + return +} + +// ----- + +// CHECK-LABEL: @conv2d_padded_f32 +func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28x3x3x28xf32>, %bias: tensor<28xf32>) -> () { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: linalg.yield %[[C0]] + // CHECK: linalg.conv_2d_nhwc_hwcf + %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [1, 1, 1, 1], stride = [1, 1], dilation = [2, 1]} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) + return +} + +// ----- + +// CHECK-LABEL: @conv2d_quant +func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1xi8>, %arg2 : tensor<1024xi32>) -> () { + // CHECK: %[[C22:.+]] = arith.constant -22 + 
// CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: linalg.yield %[[C22]] + // CHECK: linalg.conv_2d_nhwc_hwcf_q + %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = -22 : i32, weight_zp = 42 : i32}, stride = [1, 1]} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32> + return +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @depthwise_conv +func @depthwise_conv(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () { + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 5, 3, 11] + // CHECK: [[CST0:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) + // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 5, 5, 33] + // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>) + // CHECK: [[COLLAPSED:%.+]] = "tosa.reshape"([[DEPTH]]) {new_shape = [1, 5, 5, 33]} + // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) { + // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors + // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 + // CHECK: linalg.yield [[ADD]] : f32 + // CHECK: } -> tensor<1x5x5x33xf32> + %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [1, 1], dilation = [1, 1] } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>) -> (tensor<1x5x5x33xf32>) + return +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @depthwise_conv_strides +func @depthwise_conv_strides(%arg0 : tensor<1x11x9x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () { + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 5, 3, 11] + // CHECK: [[CST0:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) + // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 5, 5, 33] + // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>) + // CHECK: [[COLLAPSED:%.+]] = "tosa.reshape"([[DEPTH]]) {new_shape = [1, 5, 5, 33]} + // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) { + // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors + // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 + // CHECK: linalg.yield [[ADD]] : f32 + // CHECK: } -> tensor<1x5x5x33xf32> + %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [2, 2], dilation = [1, 1] } : (tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>) -> (tensor<1x5x5x33xf32>) + return +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = 
affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @depthwise_conv_quant +func @depthwise_conv_quant(%arg0 : tensor<1x12x12x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () { + // CHECK: [[PADV:%.+]] = arith.constant -128 + // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: linalg.yield [[PADV]] + + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 12, 12, 4, 128] + // CHECK: [[CST0:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) + // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 12, 12, 512] + // CHECK: [[C128:%.+]] = arith.constant -128 + // CHECK: [[C42:%.+]] = arith.constant 42 + // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins([[PAD]], %arg1, [[C128]], [[C42]] : tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x12x12x4x128xi32>) + // CHECK: [[COLLAPSED:%.+]] = "tosa.reshape"([[DEPTH]]) {new_shape = [1, 12, 12, 512]} + // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x12x12x512xi32>) outs([[OUT]] : tensor<1x12x12x512xi32>) { + // CHECK: ^bb0(%arg3: i32, %arg4: i32, %arg5: i32): // no predecessors + // CHECK: [[ADD:%.+]] = arith.addi %arg3, %arg4 : i32 + // CHECK: linalg.yield [[ADD]] : i32 + // CHECK: } -> tensor<1x12x12x512xi32> + %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {pad = [1, 1, 1, 1], quantization_info = {input_zp = -128 : i32, weight_zp = 42 : i32}, stride = [1, 1], dilation = [1, 1] } : (tensor<1x12x12x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x12x12x512xi32> + return +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @depthwise_conv_quant_dilations +func @depthwise_conv_quant_dilations(%arg0 : tensor<1x14x14x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () { + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 10, 10, 4, 128] + // CHECK: [[CST0:%.+]] = arith.constant 0 + // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) + // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 10, 10, 512] + // CHECK: [[C128:%.+]] = arith.constant -128 + // CHECK: [[C42:%.+]] = arith.constant 42 + // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1, [[C128]], [[C42]] : tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x10x10x4x128xi32>) + // CHECK: [[COLLAPSED:%.+]] = "tosa.reshape"([[DEPTH]]) {new_shape = [1, 10, 10, 512]} + // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x10x10x512xi32>) outs([[OUT]] : tensor<1x10x10x512xi32>) { + // CHECK: ^bb0(%arg3: i32, %arg4: i32, %arg5: i32): // no predecessors + // CHECK: [[ADD:%.+]] = arith.addi %arg3, %arg4 : i32 + // CHECK: linalg.yield [[ADD]] : i32 + // CHECK: } -> tensor<1x10x10x512xi32> + %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, weight_zp = 42 : i32}, stride = [1, 
1], dilation = [2, 2] } : (tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x10x10x512xi32> + return +} diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -1064,154 +1064,6 @@ // ----- - -// CHECK-LABEL: @matmul -func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) { - // CHECK: [[C0:%.+]] = arith.constant 0 - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 6] - // CHECK: [[FILLED:%.+]] = linalg.fill([[C0]], [[INIT]]) : f32, tensor<1x5x6xf32> -> tensor<1x5x6xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x6xf32>) outs([[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> - %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) - return %0 : tensor<1x5x6xf32> -} - -// ----- - - -// CHECK-LABEL: @matmul_quantized -func @matmul_quantized(%arg0: tensor<1x5x3xi8>, %arg1: tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) { - // CHECK: [[C0:%.+]] = arith.constant 0 - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 6] - // CHECK: [[FILLED:%.+]] = linalg.fill([[C0]], [[INIT]]) : i32, tensor<1x5x6xi32> -> tensor<1x5x6xi32> - // CHECK: [[ONE:%.+]] = arith.constant 1 - // CHECK: [[TWO:%.+]] = arith.constant 2 - // CHECK: linalg.quantized_batch_matmul ins(%arg0, %arg1, [[ONE]], [[TWO]] : tensor<1x5x3xi8>, tensor<1x3x6xi8>, i32, i32) outs([[FILLED]] : tensor<1x5x6xi32>) -> tensor<1x5x6xi32> - %0 = "tosa.matmul"(%arg0, %arg1) {quantization_info = {a_zp = 1 : i32, b_zp = 2 : i32}} : (tensor<1x5x3xi8>, tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) - return %0 : tensor<1x5x6xi32> -} - -// ----- - -// CHECK-LABEL: @matmul_dyn_batch -func @matmul_dyn_batch(%arg0: tensor<?x5x3xf32>, %arg1: tensor<?x3x6xf32>) -> (tensor<?x5x6xf32>) { - // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]] - // CHECK: %[[C0_0:.+]] = arith.constant 0 - // CHECK: %[[INIT:.+]] = linalg.init_tensor [%[[DIM]], 5, 6] - // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0_0]], %[[INIT]]) : f32, tensor<?x5x6xf32> -> tensor<?x5x6xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<?x5x3xf32>, tensor<?x3x6xf32>) outs(%[[FILLED]] : tensor<?x5x6xf32>) -> tensor<?x5x6xf32> - %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<?x5x3xf32>, tensor<?x3x6xf32>) -> (tensor<?x5x6xf32>) - return %0 : tensor<?x5x6xf32> -} - -// ----- - -// CHECK-LABEL: @matmul_dyn_independent_dim -func @matmul_dyn_independent_dim(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) { - // CHECK: %[[C2:.+]] = arith.constant 2 - // CHECK: %[[DIM:.+]] = tensor.dim %arg1, %[[C2]] - // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 5, %[[DIM]]] - // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0]], %[[INIT]]) : f32, tensor<1x5x?xf32> -> tensor<1x5x?xf32> - // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x?xf32>) outs(%[[FILLED]] : tensor<1x5x?xf32>) -> tensor<1x5x?xf32> - %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) - return %0 : tensor<1x5x?xf32> -} - -// ----- - -// CHECK-LABEL: @matmul_dyn_independent_dim -func @matmul_dyn_independent_dim(%arg0: tensor<1x5x?xf32>, %arg1: tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) { - // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 5, 6] - // CHECK: %[[FILLED:.+]] = linalg.fill(%[[C0]], %[[INIT]]) : f32, tensor<1x5x6xf32> -> tensor<1x5x6xf32> - //
CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x?xf32>, tensor<1x?x6xf32>) outs(%[[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32> - %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x?xf32>, tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) - return %0 : tensor<1x5x6xf32> -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1, d0)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)> -// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d1)> - -// CHECK-LABEL: @fully_connected -func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<5x6xf32>) { - // CHECK: [[INITT:%.+]] = linalg.init_tensor [5, 6] - // CHECK: [[ZERO:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[ZERO]], [[INITT]]) - // CHECK: [[PERM:%.+]] = arith.constant dense<[1, 0]> - // CHECK: [[INITT:%.+]] = linalg.init_tensor [3, 6] - // CHECK: [[TRANSPOSE:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<6x3xf32>) outs([[INITT]] : tensor<3x6xf32>) { - // CHECK: ^bb0([[IN:%.+]]: f32, [[UNUSED:%.+]]: f32): - // CHECK: linalg.yield [[IN]] : f32 - // CHECK: [[INITB:%.+]] = linalg.init_tensor [5, 6] - // CHECK: [[MATMUL:%.+]] = linalg.matmul ins(%arg0, [[TRANSPOSE]] : tensor<5x3xf32>, tensor<3x6xf32>) outs([[FILL]] : tensor<5x6xf32>) -> tensor<5x6xf32> - // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xf32>, tensor<5x6xf32>) outs([[INITB]] : tensor<5x6xf32>) { - // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): - // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 - // CHECK: linalg.yield [[ADD]] : f32 - - %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) : (tensor<5x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> (tensor<5x6xf32>) - return %0 : tensor<5x6xf32> -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1, d0)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)> -// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d1)> - -// CHECK-LABEL: @quantized_fully_connected -func @quantized_fully_connected(%arg0: tensor<5x3xi8>, %arg1: tensor<6x3xi8>, %arg2: tensor<6xi32>) -> (tensor<5x6xi32>) { - // CHECK: [[INITT:%.+]] = linalg.init_tensor [5, 6] - // CHECK: [[ZERO:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[ZERO]], [[INITT]]) - // CHECK: [[PERM:%.+]] = arith.constant dense<[1, 0]> - // CHECK: [[INITT:%.+]] = linalg.init_tensor [3, 6] - // CHECK: [[TRANSPOSE:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<6x3xi8>) outs([[INITT]] : tensor<3x6xi8>) { - // CHECK: ^bb0([[IN:%.+]]: i8, [[UNUSED:%.+]]: i8): - // CHECK: linalg.yield [[IN]] : i8 - // CHECK: [[INITB:%.+]] = linalg.init_tensor [5, 6] - // CHECK: [[ONE:%.+]] = arith.constant 1 - // CHECK: [[TWO:%.+]] = arith.constant 2 - // CHECK: [[MATMUL:%.+]] = linalg.quantized_matmul ins(%arg0, [[TRANSPOSE]], [[ONE]], [[TWO]] : tensor<5x3xi8>, tensor<3x6xi8>, i32, i32) outs([[FILL]] : tensor<5x6xi32>) -> tensor<5x6xi32> - // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xi32>, tensor<5x6xi32>) outs([[INITB]] - // CHECK: ^bb0([[IN1:%.+]]: i32, [[IN2:%.+]]: i32, [[UNUSED:%.+]]: i32): - // CHECK: [[ADD:%.+]] = arith.addi - // CHECK: linalg.yield [[ADD]] : i32 - %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) {quantization_info = {input_zp = 1:i32, weight_zp = 2:i32}} : (tensor<5x3xi8>, tensor<6x3xi8>, tensor<6xi32>) -> (tensor<5x6xi32>) - return %0 : tensor<5x6xi32> -} - -// ----- - -// CHECK-LABEL: @fully_connected_dyn -func @fully_connected_dyn(%arg0: tensor<?x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<?x6xf32>) { - // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]] - // CHECK: %[[INITT:.+]] = linalg.init_tensor [%[[DIM]], 6] - // CHECK: %[[ZERO:.+]] = arith.constant 0 - // CHECK: %[[FILL:.+]] = linalg.fill(%[[ZERO]], %[[INITT]]) - // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> - // CHECK: %[[INITT:.+]] = linalg.init_tensor [3, 6] - // CHECK: %[[TRANSPOSE:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<6x3xf32>) outs(%[[INITT]] : tensor<3x6xf32>) { - // CHECK: ^bb0(%[[IN:.+]]: f32, %[[UNUSED:.+]]: f32): - // CHECK: linalg.yield %[[IN]] : f32 - // CHECK: %[[INITB:.+]] = linalg.init_tensor [%[[DIM]], 6] - // CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%arg0, %[[TRANSPOSE]] : tensor<?x3xf32>, tensor<3x6xf32>) outs(%[[FILL]] : tensor<?x6xf32>) -> tensor<?x6xf32> - // CHECK: %[[ADDED:.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, %[[MATMUL]] : tensor<6xf32>, tensor<?x6xf32>) outs(%[[INITB]] : tensor<?x6xf32>) { - // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): - // CHECK: %[[ADD:.+]] = arith.addf %arg3, %arg4 : f32 - // CHECK: linalg.yield %[[ADD]] : f32 - - %0 = "tosa.fully_connected"(%arg0, %arg1, %arg2) : (tensor<?x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> (tensor<?x6xf32>) - return %0 : tensor<?x6xf32> -} - -// ----- - func @pad_float(%arg0 : tensor<1x2xf32>) -> (tensor<4x9xf32>) { %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> // TODO: Output contains multiple "arith.constant 1 : index".
@@ -1395,318 +1247,6 @@ // ----- -// CHECK-LABEL: @max_pool -func @max_pool(%arg0: tensor<1x6x34x62xf32>) -> () { - // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 - // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 32, 62] - // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[CONST]], [[INIT]]) - // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3] - // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, [[KERNEL]] : tensor<1x6x34x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x32x62xf32>) - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x32x62xf32>) - return -} - -// CHECK-LABEL: @max_pool_padded -func @max_pool_padded(%arg0: tensor<1x6x34x62xf32>) -> () { - // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 : f32 - // CHECK-DAG: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 0, 0, 0] high[0, 0, 1, 0] - // CHECK-DAG: linalg.yield [[CONST]] - // CHECK-DAG: [[INITVAL:%.+]] = arith.constant -3.40282347E+38 : f32 - // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 33, 62] - // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INITVAL]], [[INIT]]) - // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3] - // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x6x35x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x33x62xf32>) - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 1], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x33x62xf32>) - return -} - -// CHECK-LABEL: @max_pool_i8 -func @max_pool_i8(%arg0: tensor<1x6x34x62xi8>) -> () { - // CHECK: arith.constant -128 - // CHECK: linalg.pooling_nhwc_max - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi8>) -> (tensor<1x4x32x62xi8>) - return -} - -// CHECK-LABEL: @max_pool_i16 -func @max_pool_i16(%arg0: tensor<1x6x34x62xi16>) -> () { - // CHECK: arith.constant -32768 - // CHECK: linalg.pooling_nhwc_max - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi16>) -> (tensor<1x4x32x62xi16>) - return -} - -// CHECK-LABEL: @max_pool_i32 -func @max_pool_i32(%arg0: tensor<1x6x34x62xi32>) -> () { - // CHECK: arith.constant -2147483648 - // CHECK: linalg.pooling_nhwc_max - %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 0], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xi32>) -> (tensor<1x4x32x62xi32>) - return -} -// ----- - -// CHECK-LABEL: @avg_pool -func @avg_pool(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) { - // Initial piece computes the sum of the pooling region, with appropriate padding. 
- // CHECK: [[CONST:%.+]] = arith.constant 0 - // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: [[CONST:%.+]] = arith.constant 0 - // CHECK: [[POOLINIT:%.+]] = linalg.init_tensor [1, 5, 33, 62] - // CHECK: [[FILL:%.+]] = linalg.fill([[CONST]], [[POOLINIT]]) - // CHECK: [[KERNEL:%.+]] = linalg.init_tensor [4, 4] - // CHECK: [[POOL:%.+]] = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x8x36x62xf32>, tensor<4x4xf32>) outs([[FILL]] : tensor<1x5x33x62xf32>) - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 33, 62] - // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[POOL]] : tensor<1x5x33x62xf32>) outs([[INIT]] : tensor<1x5x33x62xf32>) - // CHECK: [[ZERO:%.0]] = arith.constant 0 - // CHECK: [[ONE:%.+]] = arith.constant 1 - // CHECK: [[HEIGHT:%.+]] = arith.constant 4 - // CHECK: [[WIDTH:%.+]] = arith.constant 32 - // CHECK: [[IDX1:%.+]] = linalg.index 1 - // CHECK: [[IDX2:%.+]] = linalg.index 2 - - // The large block below computes what portion of the kernel is within non-padded input. - // CHECK: [[NY:%.+]] = arith.subi [[HEIGHT]], [[IDX1]] - // CHECK: [[NX:%.+]] = arith.subi [[WIDTH]], [[IDX2]] - // CHECK: [[KH:%.+]] = arith.constant 4 - // CHECK: [[PAD0:%.+]] = arith.constant 1 - // CHECK: [[SUBP0:%.+]] = arith.subi [[IDX1]], [[PAD0]] - // CHECK: [[P0CMP:%.+]] = arith.cmpi slt, [[SUBP0]], [[ZERO]] - // CHECK: [[SELP0:%.+]] = select [[P0CMP]], [[SUBP0]], [[ZERO]] - // CHECK: [[ADDP0:%.+]] = arith.addi [[KH]], [[SELP0]] - // CHECK: [[PAD1:%.+]] = arith.constant 1 - // CHECK: [[SUBP1:%.+]] = arith.subi [[NY]], [[PAD1]] - // CHECK: [[P1CMP:%.+]] = arith.cmpi slt, [[SUBP1]], [[ZERO]] - // CHECK: [[SELP1:%.+]] = select [[P1CMP]], [[SUBP1]], [[ZERO]] - // CHECK: [[ADDP1:%.+]] = arith.addi [[ADDP0]], [[SELP1]] - // CHECK: [[YCMP:%.+]] = arith.cmpi slt, [[ADDP1]], [[ONE]] - // CHECK: [[YSEL:%.+]] = select [[YCMP]], [[ONE]], [[ADDP1]] - // CHECK: [[KW:%.+]] = arith.constant 4 : index - // CHECK: [[PAD2:%.+]] = arith.constant 1 : index - // CHECK: [[SUBP2:%.+]] = arith.subi [[IDX2]], [[PAD2]] - // CHECK: [[P2CMP:%.+]] = arith.cmpi slt, [[SUBP2]], [[ZERO]] - // CHECK: [[SELP2:%.+]] = select [[P2CMP]], [[SUBP2]], [[ZERO]] - // CHECK: [[ADDP2:%.+]] = arith.addi [[KW]], [[SELP2]] - // CHECK: [[PAD3:%.+]] = arith.constant 1 : index - // CHECK: [[SUBP3:%.+]] = arith.subi [[NX]], [[PAD3]] - // CHECK: [[P3CMP:%.+]] = arith.cmpi slt, [[SUBP3]], [[ZERO]] - // CHECK: [[SELP3:%.+]] = select [[P3CMP]], [[SUBP3]], [[ZERO]] - // CHECK: [[ADDP3:%.+]] = arith.addi [[ADDP2]], [[SELP3]] - // CHECK: [[XCMP:%.+]] = arith.cmpi slt, [[ADDP3]], [[ONE]] - // CHECK: [[XSEL:%.+]] = select [[XCMP]], [[ONE]], [[ADDP3]] - - // Given the valid coverage of the pooling region, normalize the summation. 
- // CHECK: [[C:%.+]] = arith.muli [[YSEL]], [[XSEL]] - // CHECK: [[CI:%.+]] = arith.index_cast [[C]] - // CHECK: [[CF:%.+]] = arith.sitofp [[CI]] - // CHECK: [[RESULT:%.+]] = arith.divf %arg1, [[CF]] - // CHECK: linalg.yield [[RESULT]] - %0 = "tosa.avg_pool2d"(%arg0) {pad = [1, 1, 1, 1], kernel = [4, 4], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) - return %0 : tensor<1x5x33x62xf32> -} - -// ----- - -// CHECK-LABEL: @avg_pool_i8 -func @avg_pool_i8(%arg0 : tensor<1x128x128x2xi8>) -> () { - - // CHECK: linalg.pooling_nhwc_sum - // CHECK: linalg.generic - - // CHECK: %[[INZP:.+]] = arith.constant -128 - // CHECK: %[[INZP_OFF:.+]] = arith.muli %{{.+}}, %[[INZP]] - // CHECK: %[[OFFSETED:.+]] = arith.subi %arg1, %[[INZP_OFF]] - // CHECK: %[[NUMERATOR:.+]] = arith.constant 1073741825 - // CHECK: %[[MULTIPLIER:.+]] = arith.divui %[[NUMERATOR]], %{{.+}} - // CHECK: %[[SHIFT:.+]] = arith.constant 30 - // CHECK: %[[SCALE:.+]] = "tosa.apply_scale"(%{{.+}}, %[[MULTIPLIER]], %[[SHIFT]]) {double_round = false} - // CHECK: %[[OUTZP:.+]] = arith.constant -128 - // CHECK: %[[OUT:.+]] = arith.addi %[[SCALE]], %[[OUTZP]] - // CHECK: %[[MIN:.+]] = arith.constant -128 - // CHECK: %[[MAX:.+]] = arith.constant 127 - // CHECK: %[[CMP_MIN:.+]] = arith.cmpi slt, %[[OUT]], %[[MIN]] - // CHECK: %[[CLMP_MIN:.+]] = select %[[CMP_MIN]], %[[MIN]], %[[OUT]] - // CHECK: %[[CMP_MAX:.+]] = arith.cmpi slt, %[[MAX]], %[[OUT]] - // CHECK: %[[CLMP_MAX:.+]] = select %[[CMP_MAX]], %[[MAX]], %[[CLMP_MIN]] - // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLMP_MAX]] - // CHECK: linalg.yield %[[TRUNC]] - %0 = "tosa.avg_pool2d"(%arg0) {kernel = [4, 4], pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, output_zp = -128 : i32}, stride = [4, 4]} : (tensor<1x128x128x2xi8>) -> tensor<1x32x32x2xi8> - return -} - -// ----- - -// CHECK-LABEL: @avg_pool_i16 -func @avg_pool_i16(%arg0 : tensor<1x128x128x2xi16>) -> () { - - // CHECK: linalg.pooling_nhwc_sum - // CHECK: linalg.generic - - // CHECK: %[[INZP:.+]] = arith.constant -128 - // CHECK: %[[INZP_OFF:.+]] = arith.muli %{{.+}}, %[[INZP]] - // CHECK: %[[OFFSETED:.+]] = arith.subi %arg1, %[[INZP_OFF]] - // CHECK: %[[NUMERATOR:.+]] = arith.constant 1073741825 - // CHECK: %[[MULTIPLIER:.+]] = arith.divui %[[NUMERATOR]], %{{.+}} - // CHECK: %[[SHIFT:.+]] = arith.constant 30 - // CHECK: %[[SCALE:.+]] = "tosa.apply_scale"(%{{.+}}, %[[MULTIPLIER]], %[[SHIFT]]) {double_round = false} - // CHECK: %[[OUTZP:.+]] = arith.constant -128 - // CHECK: %[[OUT:.+]] = arith.addi %[[SCALE]], %[[OUTZP]] - // CHECK: %[[MIN:.+]] = arith.constant -32768 - // CHECK: %[[MAX:.+]] = arith.constant 32767 - // CHECK: %[[CMP_MIN:.+]] = arith.cmpi slt, %[[OUT]], %[[MIN]] - // CHECK: %[[CLMP_MIN:.+]] = select %[[CMP_MIN]], %[[MIN]], %[[OUT]] - // CHECK: %[[CMP_MAX:.+]] = arith.cmpi slt, %[[MAX]], %[[OUT]] - // CHECK: %[[CLMP_MAX:.+]] = select %[[CMP_MAX]], %[[MAX]], %[[CLMP_MIN]] - // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLMP_MAX]] - // CHECK: linalg.yield %[[TRUNC]] - %0 = "tosa.avg_pool2d"(%arg0) {kernel = [4, 4], pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, output_zp = -128 : i32}, stride = [4, 4]} : (tensor<1x128x128x2xi16>) -> tensor<1x32x32x2xi16> - return -} - -// ----- - -// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)> -// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)> - -// CHECK-LABEL: @conv2d_f32 -func @conv2d_f32(%input: tensor<1x49x42x27xf32>, 
%weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () { - // CHECK: %[[W_IN:.+]] = linalg.init_tensor [3, 3, 27, 28] - // CHECK: %[[W:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[W_IN]] : tensor<3x3x27x28xf32>) - // CHECK: linalg.yield %arg3 : f32 - // CHECK: %[[M_IN:.+]] = linalg.init_tensor [1, 45, 40, 28] - // CHECK: %[[CST:.+]] = arith.constant 0 - // CHECK: %[[FILL:.+]] = linalg.fill - // CHECK: %[[B_IN:.+]] = linalg.init_tensor [1, 45, 40, 28] - // CHECK: %[[CONV:.+]] = linalg.conv_2d_nhwc_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[W]] : tensor<1x49x42x27xf32>, tensor<3x3x27x28xf32>) outs(%[[FILL]] : tensor<1x45x40x28xf32>) - // CHECK: %[[B:.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %[[CONV]] : tensor<28xf32>, tensor<1x45x40x28xf32>) outs(%[[B_IN]] : tensor<1x45x40x28xf32>) - // CHECK: arith.addf - // CHECK: linalg.yield %7 : f32 - %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [0, 0, 0, 0], stride = [1, 1], dilation = [2, 1]} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) - return -} - -// ----- - -// CHECK-LABEL: @conv2d_padded_f32 -func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28x3x3x28xf32>, %bias: tensor<28xf32>) -> () { - // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: linalg.yield %[[C0]] - // CHECK: linalg.conv_2d_nhwc_hwcf - %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [1, 1, 1, 1], stride = [1, 1], dilation = [2, 1]} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) - return -} - -// ----- - -// CHECK-LABEL: @conv2d_quant -func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1xi8>, %arg2 : tensor<1024xi32>) -> () { - // CHECK: %[[C22:.+]] = arith.constant -22 - // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: linalg.yield %[[C22]] - // CHECK: linalg.conv_2d_nhwc_hwcf_q - %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = -22 : i32, weight_zp = 42 : i32}, stride = [1, 1]} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32> - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: @depthwise_conv -func @depthwise_conv(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () { - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 5, 3, 11] - // CHECK: [[CST0:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) - // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 5, 5, 33] - // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>) - // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]] - // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) { - // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors - // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 - // CHECK: linalg.yield [[ADD]] : f32 - // CHECK: } -> tensor<1x5x5x33xf32> - %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [1, 1], dilation = [1, 1] } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>) -> (tensor<1x5x5x33xf32>) - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: @depthwise_conv_strides -func @depthwise_conv_strides(%arg0 : tensor<1x11x9x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () { - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 5, 3, 11] - // CHECK: [[CST0:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) - // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 5, 5, 33] - // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>) - // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]] - // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) { - // CHECK: ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors - // CHECK: [[ADD:%.+]] = arith.addf %arg3, %arg4 : f32 - // CHECK: linalg.yield [[ADD]] : f32 - // CHECK: } -> tensor<1x5x5x33xf32> - %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [2, 2], dilation = [1, 1] } : (tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>) -> (tensor<1x5x5x33xf32>) - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: @depthwise_conv_quant -func @depthwise_conv_quant(%arg0 : tensor<1x12x12x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () { - // CHECK: [[PADV:%.+]] = arith.constant -128 - // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: linalg.yield [[PADV]] - - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 12, 12, 4, 128] - // CHECK: [[CST0:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) - // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 12, 12, 512] - // CHECK: [[C128:%.+]] = arith.constant -128 - // CHECK: [[C42:%.+]] = arith.constant 42 - // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins([[PAD]], %arg1, [[C128]], [[C42]] : tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x12x12x4x128xi32>) - // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]] - // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x12x12x512xi32>) outs([[OUT]] : tensor<1x12x12x512xi32>) { - // 
CHECK: ^bb0(%arg3: i32, %arg4: i32, %arg5: i32): // no predecessors - // CHECK: [[ADD:%.+]] = arith.addi %arg3, %arg4 : i32 - // CHECK: linalg.yield [[ADD]] : i32 - // CHECK: } -> tensor<1x12x12x512xi32> - %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {pad = [1, 1, 1, 1], quantization_info = {input_zp = -128 : i32, weight_zp = 42 : i32}, stride = [1, 1], dilation = [1, 1] } : (tensor<1x12x12x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x12x12x512xi32> - return -} - -// ----- - -// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> -// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: @depthwise_conv_quant_dilations -func @depthwise_conv_quant_dilations(%arg0 : tensor<1x14x14x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () { - // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 10, 10, 4, 128] - // CHECK: [[CST0:%.+]] = arith.constant 0 - // CHECK: [[FILL:%.+]] = linalg.fill([[CST0]], [[INIT]]) - // CHECK: [[OUT:%.+]] = linalg.init_tensor [1, 10, 10, 512] - // CHECK: [[C128:%.+]] = arith.constant -128 - // CHECK: [[C42:%.+]] = arith.constant 42 - // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1, [[C128]], [[C42]] : tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x10x10x4x128xi32>) - // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]] - // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x10x10x512xi32>) outs([[OUT]] : tensor<1x10x10x512xi32>) { - // CHECK: ^bb0(%arg3: i32, %arg4: i32, %arg5: i32): // no predecessors - // CHECK: [[ADD:%.+]] = arith.addi %arg3, %arg4 : i32 - // CHECK: linalg.yield [[ADD]] : i32 - // CHECK: } -> tensor<1x10x10x512xi32> - %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {pad = [0, 0, 0, 0], quantization_info = {input_zp = -128 : i32, weight_zp = 42 : i32}, stride = [1, 1], dilation = [2, 2] } : (tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x10x10x512xi32> - return -} - -// ----- - // CHECK-LABEL: @resize_nearest func @resize_nearest(%input: tensor<1x2x2x1xf32>) -> () { // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 4, 4, 1] diff --git a/mlir/test/Dialect/Arithmetic/canonicalize.mlir b/mlir/test/Dialect/Arithmetic/canonicalize.mlir --- a/mlir/test/Dialect/Arithmetic/canonicalize.mlir +++ b/mlir/test/Dialect/Arithmetic/canonicalize.mlir @@ -484,3 +484,65 @@ %3 = arith.minui %arg0, %c0 : i8 return %0, %1, %2, %3: i8, i8, i8, i8 } + +// ----- + +// CHECK-LABEL: @constant_FPtoUI( +func @constant_FPtoUI() -> i32 { + // CHECK: %[[C0:.+]] = arith.constant 2 : i32 + // CHECK: return %[[C0]] + %c0 = arith.constant 2.0 : f32 + %res = arith.fptoui %c0 : f32 to i32 + return %res : i32 +} + +// ----- +// CHECK-LABEL: @invalid_constant_FPtoUI( +func @invalid_constant_FPtoUI() -> i32 { + // CHECK: %[[C0:.+]] = arith.constant -2.000000e+00 : f32 + // CHECK: %[[C1:.+]] = arith.fptoui %[[C0]] : f32 to i32 + // CHECK: return %[[C1]] + %c0 = arith.constant -2.0 : f32 + %res = arith.fptoui %c0 : f32 to i32 + return %res : i32 +} + +// ----- +// CHECK-LABEL: @constant_FPtoSI( +func @constant_FPtoSI() -> i32 { + // CHECK: %[[C0:.+]] = arith.constant -2 : i32 + // CHECK: return %[[C0]] + %c0 = arith.constant -2.0 : f32 + %res = arith.fptosi %c0 : 
f32 to i32 + return %res : i32 +} + +// ----- +// CHECK-LABEL: @invalid_constant_FPtoSI( +func @invalid_constant_FPtoSI() -> i8 { + // CHECK: %[[C0:.+]] = arith.constant 2.000000e+10 : f32 + // CHECK: %[[C1:.+]] = arith.fptosi %[[C0]] : f32 to i8 + // CHECK: return %[[C1]] + %c0 = arith.constant 2.0e10 : f32 + %res = arith.fptosi %c0 : f32 to i8 + return %res : i8 +} + +// CHECK-LABEL: @constant_SItoFP( +func @constant_SItoFP() -> f32 { + // CHECK: %[[C0:.+]] = arith.constant -2.000000e+00 : f32 + // CHECK: return %[[C0]] + %c0 = arith.constant -2 : i32 + %res = arith.sitofp %c0 : i32 to f32 + return %res : f32 +} + +// ----- +// CHECK-LABEL: @constant_UItoFP( +func @constant_UItoFP() -> f32 { + // CHECK: %[[C0:.+]] = arith.constant 2.000000e+00 : f32 + // CHECK: return %[[C0]] + %c0 = arith.constant 2 : i32 + %res = arith.uitofp %c0 : i32 to f32 + return %res : f32 +} diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -146,6 +146,9 @@ // CHECK: "llvm.intr.pow"(%[[FLOAT]], %[[FLOAT]]) : (f32, f32) -> f32 %31 = "llvm.intr.pow"(%arg1, %arg1) : (f32, f32) -> f32 +// CHECK: "llvm.intr.powi"(%[[FLOAT]], %[[I32]]) : (f32, i32) -> f32 + %a31 = "llvm.intr.powi"(%arg1, %arg0) : (f32, i32) -> f32 + // CHECK: "llvm.intr.bitreverse"(%{{.*}}) : (i32) -> i32 %32 = "llvm.intr.bitreverse"(%arg0) : (i32) -> i32 diff --git a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir --- a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir +++ b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir @@ -17,17 +17,17 @@ // CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32> // // CHECK-1D: vector.transfer_read {{.*}} : memref<8x16xf32, #{{.*}}>, vector<8x16xf32> -// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32, #{{.*}}> +// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32> // CHECK-1D: vector.transfer_read {{.*}} : memref<16x12xf32, #{{.*}}>, vector<16x12xf32> -// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32, #{{.*}}> +// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32> // CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32> -// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}> +// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32> // // CHECK-1D: vector.contract // CHECK-1D-SAME: iterator_types = ["parallel", "parallel", "reduction"] // CHECK-1D-SAME: : vector<8x16xf32>, vector<16x12xf32> into vector<8x12xf32> // -// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32> +// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32>, vector<8x12xf32> // CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}> // CHECK-2D-LABEL:func @matmul diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir --- a/mlir/test/Dialect/MemRef/canonicalize.mlir +++ b/mlir/test/Dialect/MemRef/canonicalize.mlir @@ -2,13 +2,13 @@ // CHECK-LABEL: func @subview_of_size_memcast // CHECK-SAME: %[[ARG0:.[a-z0-9A-Z_]+]]: memref<4x6x16x32xi8> -// CHECK: %[[S:.+]] = memref.subview %[[ARG0]][0, 1, 0] [1, 1, 16] [1, 1, 1] : memref<4x6x16x32xi8> to memref<16x32xi8, #{{.*}}> +//
CHECK: %[[S:.+]] = memref.subview %[[ARG0]][0, 1, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : memref<4x6x16x32xi8> to memref<16x32xi8, #{{.*}}> // CHECK: %[[M:.+]] = memref.cast %[[S]] : memref<16x32xi8, #{{.*}}> to memref<16x32xi8, #{{.*}}> // CHECK: return %[[M]] : memref<16x32xi8, #{{.*}}> func @subview_of_size_memcast(%arg : memref<4x6x16x32xi8>) -> memref<16x32xi8, affine_map<(d0, d1)[s0] -> (d0 * 32 + d1 + s0)>>{ %0 = memref.cast %arg : memref<4x6x16x32xi8> to memref - %1 = memref.subview %0[0, 1, 0] [1, 1, 16] [1, 1, 1] : + %1 = memref.subview %0[0, 1, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : memref to memref<16x32xi8, affine_map<(d0, d1)[s0] -> (d0 * 32 + d1 + s0)>> return %1 : memref<16x32xi8, affine_map<(d0, d1)[s0] -> (d0 * 32 + d1 + s0)>> @@ -450,3 +450,52 @@ // CHECK-NEXT: return [[C2]] return %rank_0 : index } + +// ----- + +#map = affine_map<(d0, d1) -> (d0 * 42 + d1)> +func @fold_no_op_subview(%arg0 : memref<20x42xf32>) -> memref<20x42xf32, #map> { + %0 = memref.subview %arg0[0, 0] [20, 42] [1, 1] : memref<20x42xf32> to memref<20x42xf32, #map> + return %0 : memref<20x42xf32, #map> +} +// CHECK-LABEL: func @fold_no_op_subview( +// CHECK: %[[ARG0:.+]]: memref<20x42xf32>) +// CHECK: %[[CAST:.+]] = memref.cast %[[ARG0]] +// CHECK: return %[[CAST]] + +// ----- + +#map = affine_map<(d0, d1) -> (d0 * 42 + d1 + 1)> +func @no_fold_subview_with_non_zero_offset(%arg0 : memref<20x42xf32>) -> memref<20x42xf32, #map> { + %0 = memref.subview %arg0[0, 1] [20, 42] [1, 1] : memref<20x42xf32> to memref<20x42xf32, #map> + return %0 : memref<20x42xf32, #map> +} +// CHECK-LABEL: func @no_fold_subview_with_non_zero_offset( +// CHECK: %[[SUBVIEW:.+]] = memref.subview +// CHECK: return %[[SUBVIEW]] + +// ----- + +#map = affine_map<(d0, d1) -> (d0 * 42 + d1 * 2)> +func @no_fold_subview_with_non_unit_stride(%arg0 : memref<20x42xf32>) -> memref<20x42xf32, #map> { + %0 = memref.subview %arg0[0, 0] [20, 42] [1, 2] : memref<20x42xf32> to memref<20x42xf32, #map> + return %0 : memref<20x42xf32, #map> +} +// CHECK-LABEL: func @no_fold_subview_with_non_unit_stride( +// CHECK: %[[SUBVIEW:.+]] = memref.subview +// CHECK: return %[[SUBVIEW]] + +// ----- + +#map = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + d1 + s0)> +func @no_fold_dynamic_no_op_subview(%arg0 : memref) -> memref { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = memref.dim %arg0, %c0 : memref + %1 = memref.dim %arg0, %c1 : memref + %2 = memref.subview %arg0[0, 0] [%0, %1] [1, 1] : memref to memref + return %2 : memref +} +// CHECK-LABEL: func @no_fold_dynamic_no_op_subview( +// CHECK: %[[SUBVIEW:.+]] = memref.subview +// CHECK: return %[[SUBVIEW]] diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir --- a/mlir/test/Dialect/MemRef/invalid.mlir +++ b/mlir/test/Dialect/MemRef/invalid.mlir @@ -149,7 +149,7 @@ // ----- func @memref_reinterpret_cast_too_many_offsets(%in: memref) { - // expected-error @+1 {{expected <= 1 offset values}} + // expected-error @+1 {{expected 1 offset values}} %out = memref.reinterpret_cast %in to offset: [0, 0], sizes: [10, 10], strides: [10, 1] : memref to memref<10x10xf32, offset: 0, strides: [10, 1]> @@ -580,7 +580,7 @@ func @invalid_subview(%arg0 : index, %arg1 : index, %arg2 : index) { %0 = memref.alloc() : memref<8x16x4xf32> - // expected-error@+1 {{expected <= 3 offset values}} + // expected-error@+1 {{expected 3 offset values}} %1 = memref.subview %0[%arg0, %arg1, 0, 0][%arg2, 0, 0, 0][1, 1, 1, 1] : memref<8x16x4xf32> to memref<8x?x4xf32, offset: 0, strides:[?, ?, 4]> 
@@ -840,3 +840,11 @@ "memref.rank"(%0): (f32)->index return } + +// ----- + +#map = affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (s0 + d0 * s1 + d1 * s2 + d2 * s3)> +func @illegal_num_offsets(%arg0 : memref, %arg1 : index, %arg2 : index) { + // expected-error@+1 {{expected 3 offset values}} + %0 = memref.subview %arg0[0, 0] [%arg1, %arg2] [1, 1] : memref to memref +} diff --git a/mlir/test/Dialect/MemRef/subview.mlir b/mlir/test/Dialect/MemRef/subview.mlir --- a/mlir/test/Dialect/MemRef/subview.mlir +++ b/mlir/test/Dialect/MemRef/subview.mlir @@ -109,12 +109,12 @@ /// Subview with only leading operands. %24 = memref.alloc() : memref<5x3xf32> - // CHECK: memref.subview %{{.*}}[2] [3] [1] : memref<5x3xf32> to memref<3x3xf32, #[[$SUBVIEW_MAP9]]> - %25 = memref.subview %24[2][3][1]: memref<5x3xf32> to memref<3x3xf32, offset: 6, strides: [3, 1]> + // CHECK: memref.subview %{{.*}}[2, 0] [3, 3] [1, 1] : memref<5x3xf32> to memref<3x3xf32, #[[$SUBVIEW_MAP9]]> + %25 = memref.subview %24[2, 0][3, 3][1, 1]: memref<5x3xf32> to memref<3x3xf32, offset: 6, strides: [3, 1]> /// Rank-reducing subview with only leading operands. - // CHECK: memref.subview %{{.*}}[1] [1] [1] : memref<5x3xf32> to memref<3xf32, #[[$SUBVIEW_MAP10]]> - %26 = memref.subview %24[1][1][1]: memref<5x3xf32> to memref<3xf32, offset: 3, strides: [1]> + // CHECK: memref.subview %{{.*}}[1, 0] [1, 3] [1, 1] : memref<5x3xf32> to memref<3xf32, #[[$SUBVIEW_MAP10]]> + %26 = memref.subview %24[1, 0][1, 3][1, 1]: memref<5x3xf32> to memref<3xf32, offset: 3, strides: [1]> // Corner-case of 0-D rank-reducing subview with an offset. // CHECK: memref.subview %{{.*}}[1, 1] [1, 1] [1, 1] : memref<5x3xf32> to memref diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -395,13 +395,13 @@ // CHECK-LABEL: func @rank_reducing_tensor_of_cast // CHECK-SAME: %[[ARG0:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8> -// CHECK: %[[S:.+]] = tensor.extract_slice %arg0[0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<4x6x16x32xi8> to tensor<16x32xi8> +// CHECK: %[[S:.+]] = tensor.extract_slice %arg0[0, 1, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : tensor<4x6x16x32xi8> to tensor<16x32xi8> // Tensor cast is moved after slice and then gets canonicalized away. // CHECK-NOT: tensor.cast // CHECK: return %[[S]] : tensor<16x32xi8> func @rank_reducing_tensor_of_cast(%arg : tensor<4x6x16x32xi8>) -> tensor<16x32xi8> { %0 = tensor.cast %arg : tensor<4x6x16x32xi8> to tensor - %1 = tensor.extract_slice %0[0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor to tensor<16x32xi8> + %1 = tensor.extract_slice %0[0, 1, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : tensor to tensor<16x32xi8> return %1 : tensor<16x32xi8> } @@ -410,7 +410,7 @@ // CHECK-LABEL: func @rank_reducing_insert_slice_of_cast // CHECK-SAME: %[[A:.[a-z0-9A-Z_]+]]: tensor<16x32xi8> // CHECK-SAME: %[[B:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8> -// CHECK: %[[S:.+]] = tensor.insert_slice %[[A]] into %[[B]][0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<16x32xi8> into tensor<4x6x16x32xi8> +// CHECK: %[[S:.+]] = tensor.insert_slice %[[A]] into %[[B]][0, 1, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : tensor<16x32xi8> into tensor<4x6x16x32xi8> // Tensor cast is folded away. 
// CHECK-NOT: tensor.cast // CHECK: return %[[S]] : tensor<4x6x16x32xi8> @@ -418,7 +418,7 @@ %c0 = arith.constant 0: index %cast = tensor.cast %a : tensor<16x32xi8> to tensor %sz = tensor.dim %cast, %c0: tensor - %res = tensor.insert_slice %cast into %b[0, 1, 0] [1, 1, %sz] [1, 1, 1] : tensor into tensor<4x6x16x32xi8> + %res = tensor.insert_slice %cast into %b[0, 1, 0, 0] [1, 1, %sz, 32] [1, 1, 1, 1] : tensor into tensor<4x6x16x32xi8> return %res : tensor<4x6x16x32xi8> } diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir --- a/mlir/test/Dialect/Tensor/invalid.mlir +++ b/mlir/test/Dialect/Tensor/invalid.mlir @@ -300,3 +300,20 @@ "tensor.rank"(%0): (f32)->index return } + +// ----- + +func @illegal_num_offsets(%arg0 : tensor, %arg1 : index, %arg2 : index) { + // expected-error@+1 {{expected 3 offset values}} + %0 = tensor.extract_slice %arg0[0, 0] [%arg1, %arg2] [1, 1] : tensor to tensor + return +} + +// ----- + +func @illegal_num_offsets(%arg0 : tensor, %arg1 : tensor, + %arg2 : index, %arg3 : index) { + // expected-error@+1 {{expected 3 offset values}} + %0 = tensor.insert_slice %arg0 into %arg1[0, 0] [%arg2, %arg3] [1, 1] : tensor into tensor + return +} diff --git a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s -test-vector-transfer-drop-unit-dims-patterns -split-input-file | FileCheck %s -// ----- - func @transfer_read_rank_reducing( %arg : memref<1x1x3x2xi8, offset:?, strides:[6, 6, 2, 1]>) -> vector<3x2xi8> { %c0 = arith.constant 0 : index diff --git a/mlir/test/Integration/Dialect/Standard/CPU/test_subview.mlir b/mlir/test/Integration/Dialect/Standard/CPU/test_subview.mlir --- a/mlir/test/Integration/Dialect/Standard/CPU/test_subview.mlir +++ b/mlir/test/Integration/Dialect/Standard/CPU/test_subview.mlir @@ -13,7 +13,7 @@ %0 = memref.get_global @__constant_5x3xf32 : memref<5x3xf32> /// Subview with only leading operands. - %1 = memref.subview %0[2][3][1]: memref<5x3xf32> to memref<3x3xf32, offset: 6, strides: [3, 1]> + %1 = memref.subview %0[2, 0][3, 3][1, 1]: memref<5x3xf32> to memref<3x3xf32, offset: 6, strides: [3, 1]> %unranked = memref.cast %1 : memref<3x3xf32, offset: 6, strides: [3, 1]> to memref<*xf32> call @print_memref_f32(%unranked) : (memref<*xf32>) -> () @@ -50,7 +50,7 @@ // CHECK-NEXT: [2, 5, 8, 11, 14] /// Rank-reducing subview with only leading operands. 
- %4 = memref.subview %0[1][1][1]: memref<5x3xf32> to memref<3xf32, offset: 3, strides: [1]> + %4 = memref.subview %0[1, 0][1, 3][1, 1]: memref<5x3xf32> to memref<3xf32, offset: 3, strides: [1]> %unranked4 = memref.cast %4 : memref<3xf32, offset: 3, strides: [1]> to memref<*xf32> call @print_memref_f32(%unranked4) : (memref<*xf32>) -> () // CHECK: Unranked Memref base@ = {{0x[-9a-f]*}} diff --git a/mlir/test/Target/LLVMIR/openacc-llvm.mlir b/mlir/test/Target/LLVMIR/openacc-llvm.mlir --- a/mlir/test/Target/LLVMIR/openacc-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openacc-llvm.mlir @@ -24,7 +24,7 @@ // CHECK: %struct.ident_t = type { i32, i32, i32, i32, i8* } // CHECK: [[LOCSTR:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};testenterdataop;{{[0-9]*}};{{[0-9]*}};;\00", align 1 -// CHECK: [[LOCGLOBAL:@.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]*}} x i8], [{{[0-9]*}} x i8]* [[LOCSTR]], i32 0, i32 0) }, align 8 +// CHECK: [[LOCGLOBAL:@.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 {{[0-9]*}}, i8* getelementptr inbounds ([{{[0-9]*}} x i8], [{{[0-9]*}} x i8]* [[LOCSTR]], i32 0, i32 0) }, align 8 // CHECK: [[MAPNAME1:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};unknown;{{[0-9]*}};{{[0-9]*}};;\00", align 1 // CHECK: [[MAPNAME2:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};unknown;{{[0-9]*}};{{[0-9]*}};;\00", align 1 // CHECK: [[MAPTYPES:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i64] [i64 0, i64 1] @@ -84,7 +84,7 @@ // CHECK: %struct.ident_t = type { i32, i32, i32, i32, i8* } // CHECK: [[LOCSTR:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};testexitdataop;{{[0-9]*}};{{[0-9]*}};;\00", align 1 -// CHECK: [[LOCGLOBAL:@.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]*}} x i8], [{{[0-9]*}} x i8]* [[LOCSTR]], i32 0, i32 0) }, align 8 +// CHECK: [[LOCGLOBAL:@.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 {{[0-9]*}}, i8* getelementptr inbounds ([{{[0-9]*}} x i8], [{{[0-9]*}} x i8]* [[LOCSTR]], i32 0, i32 0) }, align 8 // CHECK: [[MAPNAME1:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};unknown;{{[0-9]*}};{{[0-9]*}};;\00", align 1 // CHECK: [[MAPNAME2:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};unknown;{{[0-9]*}};{{[0-9]*}};;\00", align 1 // CHECK: [[MAPTYPES:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i64] [i64 8, i64 2] @@ -143,7 +143,7 @@ // CHECK: %struct.ident_t = type { i32, i32, i32, i32, i8* } // CHECK: [[LOCSTR:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};testupdateop;{{[0-9]*}};{{[0-9]*}};;\00", align 1 -// CHECK: [[LOCGLOBAL:@.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]*}} x i8], [{{[0-9]*}} x i8]* [[LOCSTR]], i32 0, i32 0) }, align 8 +// CHECK: [[LOCGLOBAL:@.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 {{[0-9]*}}, i8* getelementptr inbounds ([{{[0-9]*}} x i8], [{{[0-9]*}} x i8]* [[LOCSTR]], i32 0, i32 0) }, align 8 // CHECK: [[MAPNAME1:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};unknown;{{[0-9]*}};{{[0-9]*}};;\00", align 1 // CHECK: [[MAPNAME2:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};unknown;{{[0-9]*}};{{[0-9]*}};;\00", align 1 // CHECK: [[MAPTYPES:@.*]] = private unnamed_addr 
constant [{{[0-9]*}} x i64] [i64 2, i64 1] @@ -205,7 +205,7 @@ // CHECK: %struct.ident_t = type { i32, i32, i32, i32, i8* } // CHECK: [[LOCSTR:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};testdataop;{{[0-9]*}};{{[0-9]*}};;\00", align 1 -// CHECK: [[LOCGLOBAL:@.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]*}} x i8], [{{[0-9]*}} x i8]* [[LOCSTR]], i32 0, i32 0) }, align 8 +// CHECK: [[LOCGLOBAL:@.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 {{[0-9]*}}, i8* getelementptr inbounds ([{{[0-9]*}} x i8], [{{[0-9]*}} x i8]* [[LOCSTR]], i32 0, i32 0) }, align 8 // CHECK: [[MAPNAME1:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};unknown;{{[0-9]*}};{{[0-9]*}};;\00", align 1 // CHECK: [[MAPNAME2:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i8] c";{{.*}};unknown;{{[0-9]*}};{{[0-9]*}};;\00", align 1 // CHECK: [[MAPTYPES:@.*]] = private unnamed_addr constant [{{[0-9]*}} x i64] [i64 8195, i64 8194] diff --git a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp --- a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp +++ b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp @@ -476,24 +476,23 @@ EXPECT_TRUE(simplex.isRedundantEquality({-1, 0, 2})); // x = 2. } -static FlatAffineConstraints parseFAC(StringRef str, MLIRContext *context) { - FailureOr fac = parseIntegerSetToFAC(str, context); +static IntegerPolyhedron parsePoly(StringRef str, MLIRContext *context) { + FailureOr poly = parseIntegerSetToFAC(str, context); - EXPECT_TRUE(succeeded(fac)); + EXPECT_TRUE(succeeded(poly)); - return *fac; + return *poly; } TEST(SimplexTest, IsRationalSubsetOf) { MLIRContext context; - FlatAffineConstraints univ = FlatAffineConstraints::getUniverse(1, 0); - FlatAffineConstraints empty = - parseFAC("(x) : (x + 0 >= 0, -x - 1 >= 0)", &context); - FlatAffineConstraints s1 = parseFAC("(x) : ( x >= 0, -x + 4 >= 0)", &context); - FlatAffineConstraints s2 = - parseFAC("(x) : (x - 1 >= 0, -x + 3 >= 0)", &context); + IntegerPolyhedron univ = parsePoly("(x) : ()", &context); + IntegerPolyhedron empty = + parsePoly("(x) : (x + 0 >= 0, -x - 1 >= 0)", &context); + IntegerPolyhedron s1 = parsePoly("(x) : ( x >= 0, -x + 4 >= 0)", &context); + IntegerPolyhedron s2 = parsePoly("(x) : (x - 1 >= 0, -x + 3 >= 0)", &context); Simplex simUniv(univ); Simplex simEmpty(empty); diff --git a/mlir/unittests/CMakeLists.txt b/mlir/unittests/CMakeLists.txt --- a/mlir/unittests/CMakeLists.txt +++ b/mlir/unittests/CMakeLists.txt @@ -11,6 +11,7 @@ add_subdirectory(Interfaces) add_subdirectory(IR) add_subdirectory(Pass) +add_subdirectory(Support) add_subdirectory(Rewrite) add_subdirectory(TableGen) add_subdirectory(Transforms) diff --git a/mlir/unittests/Support/CMakeLists.txt b/mlir/unittests/Support/CMakeLists.txt --- a/mlir/unittests/Support/CMakeLists.txt +++ b/mlir/unittests/Support/CMakeLists.txt @@ -7,4 +7,4 @@ ) target_link_libraries(MLIRSupportTests - PRIVATE MLIRSupportIndentedOstream MLIRSupport) + PRIVATE MLIRSupport) diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -26,7 +26,9 @@ ///{ /// Add worst-case padding so that future allocations are properly aligned. -constexpr const uint32_t Alignment = 8; +/// FIXME: The stack shouldn't require worst-case padding. 
Alignment needs to be +/// passed in as an argument and the stack rewritten to support it. +constexpr const uint32_t Alignment = 16; /// External symbol to access dynamic shared memory. extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment))); diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/include/device.h rename from openmp/libomptarget/src/device.h rename to openmp/libomptarget/include/device.h diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/include/rtl.h rename from openmp/libomptarget/src/rtl.h rename to openmp/libomptarget/include/rtl.h diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -114,21 +114,6 @@ return OFFLOAD_SUCCESS; } -int createEvent(void **P) { - CUevent Event = nullptr; - - CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT); - if (Err != CUDA_SUCCESS) { - DP("Error when creating event event = " DPxMOD "\n", DPxPTR(Event)); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - *P = Event; - - return OFFLOAD_SUCCESS; -} - int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { CUstream Stream = reinterpret_cast(AsyncInfo->Queue); CUevent Event = reinterpret_cast(EventPtr); @@ -157,19 +142,6 @@ return OFFLOAD_SUCCESS; } -int destroyEvent(void *EventPtr) { - CUevent Event = reinterpret_cast(EventPtr); - - CUresult Err = cuEventDestroy(Event); - if (Err != CUDA_SUCCESS) { - DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event)); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; -} - // Structure contains per-device data struct DeviceDataTy { /// List that contains all the kernels. @@ -187,137 +159,158 @@ int NumThreads = 0; }; -class StreamManagerTy { - int NumberOfDevices; - // The initial size of stream pool - int EnvNumInitialStreams; - // Per-device stream mutex - std::vector> StreamMtx; - // Per-device stream Id indicates the next available stream in the pool - std::vector NextStreamId; - // Per-device stream pool - std::vector> StreamPool; - // Reference to per-device data - std::vector &DeviceData; - - // If there is no CUstream left in the pool, we will resize the pool to - // allocate more CUstream. This function should be called with device mutex, - // and we do not resize to smaller one. - void resizeStreamPool(const int DeviceId, const size_t NewSize) { - std::vector &Pool = StreamPool[DeviceId]; - const size_t CurrentSize = Pool.size(); - assert(NewSize > CurrentSize && "new size is not larger than current size"); +/// Resource allocator where \p T is the resource type. +/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL +/// accordingly. The implementation should not raise any exception. +template class AllocatorTy { +public: + /// Create a resource and assign to R. + int create(T &R) noexcept; + /// Destroy the resource. + int destroy(T) noexcept; +}; - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { - // We will return if cannot switch to the right context in case of - // creating bunch of streams that are not corresponding to the right - // device. The offloading will fail later because selected CUstream is - // nullptr. - return; - } +/// Allocator for CUstream. 
+template <> class AllocatorTy { + CUcontext Context; - Pool.resize(NewSize, nullptr); +public: + AllocatorTy(CUcontext C) noexcept : Context(C) {} - for (size_t I = CurrentSize; I < NewSize; ++I) { - checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING), - "Error returned from cuStreamCreate\n"); - } + /// See AllocatorTy::create. + int create(CUstream &Stream) noexcept { + if (!checkResult(cuCtxSetCurrent(Context), + "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING), + "Error returned from cuStreamCreate\n")) + return OFFLOAD_FAIL; + + return OFFLOAD_SUCCESS; } -public: - StreamManagerTy(const int NumberOfDevices, - std::vector &DeviceData) - : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32), - DeviceData(DeviceData) { - StreamPool.resize(NumberOfDevices); - NextStreamId.resize(NumberOfDevices); - StreamMtx.resize(NumberOfDevices); + /// See AllocatorTy::destroy. + int destroy(CUstream Stream) noexcept { + if (!checkResult(cuCtxSetCurrent(Context), + "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + if (!checkResult(cuStreamDestroy(Stream), + "Error returned from cuStreamDestroy\n")) + return OFFLOAD_FAIL; - if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) - EnvNumInitialStreams = std::stoi(EnvStr); + return OFFLOAD_SUCCESS; + } +}; - // Initialize the next stream id - std::fill(NextStreamId.begin(), NextStreamId.end(), 0); +/// Allocator for CUevent. +template <> class AllocatorTy { +public: + /// See AllocatorTy::create. + int create(CUevent &Event) noexcept { + if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT), + "Error returned from cuEventCreate\n")) + return OFFLOAD_FAIL; - // Initialize stream mutex - for (std::unique_ptr &Ptr : StreamMtx) - Ptr = std::make_unique(); + return OFFLOAD_SUCCESS; } - ~StreamManagerTy() { - // Destroy streams - for (int I = 0; I < NumberOfDevices; ++I) { - checkResult(cuCtxSetCurrent(DeviceData[I].Context), - "Error returned from cuCtxSetCurrent\n"); + /// See AllocatorTy::destroy. + int destroy(CUevent Event) noexcept { + if (!checkResult(cuEventDestroy(Event), + "Error returned from cuEventDestroy\n")) + return OFFLOAD_FAIL; - for (CUstream &S : StreamPool[I]) { - if (S) - checkResult(cuStreamDestroy(S), - "Error returned from cuStreamDestroy\n"); - } - } + return OFFLOAD_SUCCESS; } +}; - // Get a CUstream from pool. Per-device next stream id always points to the - // next available CUstream. That means, CUstreams [0, id-1] have been - // assigned, and [id,] are still available. If there is no CUstream left, we - // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned, - // the id will increase one. - // xxxxxs+++++++++ - // ^ - // id - // After assignment, the pool becomes the following and s is assigned. - // xxxxxs+++++++++ - // ^ - // id - CUstream getStream(const int DeviceId) { - const std::lock_guard Lock(*StreamMtx[DeviceId]); - int &Id = NextStreamId[DeviceId]; - // No CUstream left in the pool, we need to request from CUDA RT - if (Id == static_cast(StreamPool[DeviceId].size())) { - // By default we double the stream pool every time - resizeStreamPool(DeviceId, Id * 2); +/// A generic pool of resources where \p T is the resource type. +/// \p T should be copyable as the object is stored in \p std::vector . +template class ResourcePoolTy { + /// Index of the next available resource. + size_t Next = 0; + /// Mutex to guard the pool. + std::mutex Mutex; + /// Pool of resources. 
+ std::vector Resources; + /// A reference to the corresponding allocator. + AllocatorTy Allocator; + + /// If `Resources` is used up, we will fill in more resources. It assumes that + /// the new size `Size` should be always larger than the current size. + bool resize(size_t Size) { + auto CurSize = Resources.size(); + assert(Size > CurSize && "Unexpected smaller size"); + Resources.reserve(Size); + for (auto I = CurSize; I < Size; ++I) { + T NewItem; + int Ret = Allocator.create(NewItem); + if (Ret != OFFLOAD_SUCCESS) + return false; + Resources.push_back(NewItem); } - return StreamPool[DeviceId][Id++]; + return true; } - // Return a CUstream back to pool. As mentioned above, per-device next - // stream is always points to the next available CUstream, so when we return - // a CUstream, we need to first decrease the id, and then copy the CUstream - // back. - // It is worth noting that, the order of streams return might be different - // from that they're assigned, that saying, at some point, there might be - // two identical CUstreams. - // xxax+a+++++ - // ^ - // id - // However, it doesn't matter, because they're always on the two sides of - // id. The left one will in the end be overwritten by another CUstream. - // Therefore, after several execution, the order of pool might be different - // from its initial state. - void returnStream(const int DeviceId, CUstream Stream) { - const std::lock_guard Lock(*StreamMtx[DeviceId]); - int &Id = NextStreamId[DeviceId]; - assert(Id > 0 && "Wrong stream ID"); - StreamPool[DeviceId][--Id] = Stream; +public: + ResourcePoolTy(AllocatorTy &&A, size_t Size = 0) noexcept + : Allocator(std::move(A)) { + if (Size) + (void)resize(Size); } - bool initializeDeviceStreamPool(const int DeviceId) { - assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); + ~ResourcePoolTy() noexcept { clear(); } + + /// Get a resource from pool. `Next` always points to the next available + /// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are + /// still available. If there is no resource left, we will ask for more. Each + /// time a resource is assigned, the id will increase one. + /// xxxxxs+++++++++ + /// ^ + /// Next + /// After assignment, the pool becomes the following and s is assigned. + /// xxxxxs+++++++++ + /// ^ + /// Next + int acquire(T &R) noexcept { + std::lock_guard LG(Mutex); + if (Next == Resources.size()) { + auto NewSize = Resources.size() ? Resources.size() * 2 : 1; + if (!resize(NewSize)) + return OFFLOAD_FAIL; + } - resizeStreamPool(DeviceId, EnvNumInitialStreams); + assert(Next < Resources.size()); - // Check the size of stream pool - if (static_cast(StreamPool[DeviceId].size()) != EnvNumInitialStreams) - return false; + R = Resources[Next++]; - // Check whether each stream is valid - for (CUstream &S : StreamPool[DeviceId]) - if (!S) - return false; + return OFFLOAD_SUCCESS; + } - return true; + /// Return the resource back to the pool. When we return a resource, we need + /// to first decrease `Next`, and then copy the resource back. It is worth + /// noting that, the order of resources return might be different from that + /// they're assigned, that saying, at some point, there might be two identical + /// resources. + /// xxax+a+++++ + /// ^ + /// Next + /// However, it doesn't matter, because they're always on the two sides of + /// `Next`. The left one will in the end be overwritten by another resource. 
+ /// Therefore, after several execution, the order of pool might be different + /// from its initial state. + void release(T R) noexcept { + std::lock_guard LG(Mutex); + Resources[--Next] = R; + } + + /// Released all stored resources and clear the pool. + /// Note: This function is not thread safe. Be sure to guard it if necessary. + void clear() noexcept { + for (auto &R : Resources) + (void)Allocator.destroy(R); + Resources.clear(); } }; @@ -331,13 +324,19 @@ int64_t RequiresFlags; // Amount of dynamic shared memory to use at launch. uint64_t DynamicMemorySize; + // Number of initial streams for each device. + int NumInitialStreams = 32; static constexpr const int HardTeamLimit = 1U << 16U; // 64k static constexpr const int HardThreadLimit = 1024; static constexpr const int DefaultNumTeams = 128; static constexpr const int DefaultNumThreads = 128; - std::unique_ptr StreamManager; + using StreamPoolTy = ResourcePoolTy; + std::vector> StreamPool; + + ResourcePoolTy EventPool; + std::vector DeviceData; std::vector Modules; @@ -471,8 +470,13 @@ CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { assert(AsyncInfo && "AsyncInfo is nullptr"); - if (!AsyncInfo->Queue) - AsyncInfo->Queue = StreamManager->getStream(DeviceId); + if (!AsyncInfo->Queue) { + CUstream S; + if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS) + return nullptr; + + AsyncInfo->Queue = S; + } return reinterpret_cast(AsyncInfo->Queue); } @@ -485,7 +489,7 @@ DeviceRTLTy() : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), - DynamicMemorySize(0) { + DynamicMemorySize(0), EventPool(AllocatorTy()) { DP("Start initializing CUDA\n"); @@ -509,6 +513,7 @@ } DeviceData.resize(NumberOfDevices); + StreamPool.resize(NumberOfDevices); // Get environment variables regarding teams if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { @@ -532,9 +537,11 @@ DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", DynamicMemorySize); } - - StreamManager = - std::make_unique(NumberOfDevices, DeviceData); + if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) { + // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set + NumInitialStreams = std::stoi(EnvStr); + DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams); + } for (int I = 0; I < NumberOfDevices; ++I) DeviceAllocators.emplace_back(I, DeviceData); @@ -556,13 +563,16 @@ for (auto &M : MemoryManagers) M.release(); - StreamManager = nullptr; - for (CUmodule &M : Modules) // Close module if (M) checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); + for (auto &S : StreamPool) + S.reset(); + + EventPool.clear(); + for (DeviceDataTy &D : DeviceData) { // Destroy context if (D.Context) { @@ -627,8 +637,10 @@ return OFFLOAD_FAIL; // Initialize stream pool - if (!StreamManager->initializeDeviceStreamPool(DeviceId)) - return OFFLOAD_FAIL; + if (!StreamPool[DeviceId]) + StreamPool[DeviceId] = std::make_unique( + AllocatorTy(DeviceData[DeviceId].Context), + NumInitialStreams); // Query attributes to determine number of threads/block and blocks/grid. int MaxGridDimX; @@ -1195,8 +1207,7 @@ // Once the stream is synchronized, return it to stream pool and reset // AsyncInfo. This is to make sure the synchronization only works for its // own tasks. 
- StreamManager->returnStream(DeviceId, - reinterpret_cast(AsyncInfo->Queue)); + StreamPool[DeviceId]->release(reinterpret_cast(AsyncInfo->Queue)); AsyncInfo->Queue = nullptr; if (Err != CUDA_SUCCESS) { @@ -1382,6 +1393,19 @@ printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); } + int createEvent(void **P) { + CUevent Event = nullptr; + if (EventPool.acquire(Event) != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + *P = Event; + return OFFLOAD_SUCCESS; + } + + int destroyEvent(void *EventPtr) { + EventPool.release(reinterpret_cast(EventPtr)); + return OFFLOAD_SUCCESS; + } + int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo, void *EventPtr) const { CUstream Stream = getStream(DeviceId, AsyncInfo); @@ -1607,7 +1631,7 @@ int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { assert(event && "event is nullptr"); - return createEvent(event); + return DeviceRTL.createEvent(event); } int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, @@ -1637,7 +1661,7 @@ int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) { assert(event_ptr && "event is nullptr"); - return destroyEvent(event_ptr); + return DeviceRTL.destroyEvent(event_ptr); } #ifdef __cplusplus