diff --git a/clang-tools-extra/docs/clang-rename.rst b/clang-tools-extra/docs/clang-rename.rst --- a/clang-tools-extra/docs/clang-rename.rst +++ b/clang-tools-extra/docs/clang-rename.rst @@ -16,7 +16,7 @@ The tool is in a very early development stage, so you might encounter bugs and crashes. Submitting reports with information about how to reproduce the issue -to `the LLVM bugtracker `_ will definitely help the +to `the LLVM bugtracker `_ will definitely help the project. If you have any ideas or suggestions, you might want to put a feature request there. diff --git a/clang/docs/LibASTImporter.rst b/clang/docs/LibASTImporter.rst --- a/clang/docs/LibASTImporter.rst +++ b/clang/docs/LibASTImporter.rst @@ -119,7 +119,7 @@ llvm::Expected ImportedOrErr = Importer.Import(From); The ``Import`` call returns with ``llvm::Expected``, so, we must check for any error. -Please refer to the `error handling `_ documentation for details. +Please refer to the `error handling `_ documentation for details. .. code-block:: cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -304,7 +304,7 @@ A wide variety of additional information is available on the `Clang web page `_. The web page contains versions of the -API documentation which are up-to-date with the Subversion version of +API documentation which are up-to-date with the Git version of the source code. You can access versions of these documents specific to this release by going into the "``clang/docs/``" directory in the Clang tree. diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -2199,9 +2199,9 @@ alpha.unix.SimpleStream (C) """"""""""""""""""""""""""" Check for misuses of stream APIs. Check for misuses of stream APIs: ``fopen, fclose`` -(demo checker, the subject of the demo (`Slides `_ , +(demo checker, the subject of the demo (`Slides `_ , `Video `_) by Anna Zaks and Jordan Rose presented at the -`2012 LLVM Developers' Meeting `_). +`2012 LLVM Developers' Meeting `_). .. code-block:: c diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -7060,10 +7060,10 @@ llvm::SmallVector Sets; bool anyScoreOrCondition( - const llvm::function_ref &Cond) { - return llvm::any_of(Sets, [&Cond](OMPTraitInfo::OMPTraitSet &Set) { + llvm::function_ref Cond) { + return llvm::any_of(Sets, [&](OMPTraitInfo::OMPTraitSet &Set) { return llvm::any_of( - Set.Selectors, [&Cond](OMPTraitInfo::OMPTraitSelector &Selector) { + Set.Selectors, [&](OMPTraitInfo::OMPTraitSelector &Selector) { return Cond(Selector.ScoreOrCondition, /* IsScore */ Selector.Kind != llvm::omp::TraitSelector::user_condition); diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -295,6 +295,12 @@ const llvm::opt::DerivedArgList &Args, bool SameTripleAsHost, SmallVectorImpl &AllocatedArgs) const; + /// Append the argument following \p A to \p DAL assuming \p A is an Xarch + /// argument. + virtual void TranslateXarchArgs(const llvm::opt::DerivedArgList &Args, + llvm::opt::Arg *&A, + llvm::opt::DerivedArgList *DAL) const; + /// Choose a tool to use to handle the action \p JA. 
/// /// This can be overridden when a particular ToolChain needs to use diff --git a/clang/lib/CodeGen/EHScopeStack.h b/clang/lib/CodeGen/EHScopeStack.h --- a/clang/lib/CodeGen/EHScopeStack.h +++ b/clang/lib/CodeGen/EHScopeStack.h @@ -148,7 +148,7 @@ virtual void anchor(); protected: - virtual ~Cleanup() = default; + ~Cleanup() = default; public: Cleanup(const Cleanup &) = default; diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1102,3 +1102,33 @@ delete DAL; return nullptr; } + +void ToolChain::TranslateXarchArgs(const llvm::opt::DerivedArgList &Args, + llvm::opt::Arg *&A, + llvm::opt::DerivedArgList *DAL) const { + const OptTable &Opts = getDriver().getOpts(); + unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); + unsigned Prev = Index; + std::unique_ptr XarchArg(Opts.ParseOneArg(Args, Index)); + + // If the argument parsing failed or more than one argument was + // consumed, the -Xarch_ argument's parameter tried to consume + // extra arguments. Emit an error and ignore. + // + // We also want to disallow any options which would alter the + // driver behavior; that isn't going to work in our model. We + // use isDriverOption() as an approximation, although things + // like -O4 are going to slip through. + if (!XarchArg || Index > Prev + 1) { + getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) + << A->getAsString(Args); + return; + } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { + getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) + << A->getAsString(Args); + return; + } + XarchArg->setBaseArg(A); + A = XarchArg.release(); + DAL->AddSynthesizedArg(A); +} diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -804,31 +804,7 @@ // Skip this argument unless the architecture matches BoundArch if (BoundArch.empty() || A->getValue(0) != BoundArch) continue; - - unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); - unsigned Prev = Index; - std::unique_ptr XarchArg(Opts.ParseOneArg(Args, Index)); - - // If the argument parsing failed or more than one argument was - // consumed, the -Xarch_ argument's parameter tried to consume - // extra arguments. Emit an error and ignore. - // - // We also want to disallow any options which would alter the - // driver behavior; that isn't going to work in our model. We - // use isDriverOption() as an approximation, although things - // like -O4 are going to slip through. 
- if (!XarchArg || Index > Prev + 1) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) - << A->getAsString(Args); - continue; - } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) - << A->getAsString(Args); - continue; - } - XarchArg->setBaseArg(A); - A = XarchArg.release(); - DAL->AddSynthesizedArg(A); + TranslateXarchArgs(Args, A, DAL); } DAL->append(A); } diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -2132,32 +2132,7 @@ continue; Arg *OriginalArg = A; - unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); - unsigned Prev = Index; - std::unique_ptr XarchArg(Opts.ParseOneArg(Args, Index)); - - // If the argument parsing failed or more than one argument was - // consumed, the -Xarch_ argument's parameter tried to consume - // extra arguments. Emit an error and ignore. - // - // We also want to disallow any options which would alter the - // driver behavior; that isn't going to work in our model. We - // use isDriverOption() as an approximation, although things - // like -O4 are going to slip through. - if (!XarchArg || Index > Prev + 1) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) - << A->getAsString(Args); - continue; - } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) - << A->getAsString(Args); - continue; - } - - XarchArg->setBaseArg(A); - - A = XarchArg.release(); - DAL->AddSynthesizedArg(A); + TranslateXarchArgs(Args, A, DAL); // Linker input arguments require custom handling. The problem is that we // have already constructed the phase actions, so we can not treat them as diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -382,31 +382,7 @@ // Skip this argument unless the architecture matches BoundArch. if (BoundArch.empty() || A->getValue(0) != BoundArch) continue; - - unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); - unsigned Prev = Index; - std::unique_ptr XarchArg(Opts.ParseOneArg(Args, Index)); - - // If the argument parsing failed or more than one argument was - // consumed, the -Xarch_ argument's parameter tried to consume - // extra arguments. Emit an error and ignore. - // - // We also want to disallow any options which would alter the - // driver behavior; that isn't going to work in our model. We - // use isDriverOption() as an approximation, although things - // like -O4 are going to slip through. - if (!XarchArg || Index > Prev + 1) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) - << A->getAsString(Args); - continue; - } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) - << A->getAsString(Args); - continue; - } - XarchArg->setBaseArg(A); - A = XarchArg.release(); - DAL->AddSynthesizedArg(A); + TranslateXarchArgs(Args, A, DAL); } DAL->append(A); } diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -32,5 +32,7 @@ # and libraries potentially draw from the components present in all # of the other directories. 
add_subdirectory(lib) -add_subdirectory(test) -add_subdirectory(fuzzing) +if(LLVM_INCLUDE_TESTS) + add_subdirectory(test) + add_subdirectory(fuzzing) +endif() diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -194,9 +194,16 @@ }]; } +def SighandlerTDefn : TypeDecl<"__sighandler_t"> { + let Decl = [{ + typedef void(*__sighandler_t)(int); + }]; +} + def SignalAPI : PublicAPI<"signal.h"> { let TypeDeclarations = [ StructSigactionDefn, + SighandlerTDefn, ]; let Functions = [ @@ -205,6 +212,7 @@ "sigprocmask", "sigemptyset", "sigaddset", + "signal", ]; } diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt --- a/libc/lib/CMakeLists.txt +++ b/libc/lib/CMakeLists.txt @@ -23,6 +23,7 @@ sigaddset sigemptyset sigprocmask + signal # stdlib.h entrypoints _Exit diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -16,6 +16,8 @@ PtrType IntPtr = PtrType; + NamedType SigHandlerT = NamedType<"__sighandler_t">; + HeaderSpec Assert = HeaderSpec< "assert.h", [ @@ -226,10 +228,16 @@ ], [ SizeTType, + SigHandlerT, ], [], // Enumerations [ FunctionSpec<"raise", RetValSpec, [ArgSpec]>, + FunctionSpec< + "signal", + RetValSpec, + [ArgSpec, ArgSpec] + >, ] >; diff --git a/libc/src/signal/linux/CMakeLists.txt b/libc/src/signal/linux/CMakeLists.txt --- a/libc/src/signal/linux/CMakeLists.txt +++ b/libc/src/signal/linux/CMakeLists.txt @@ -84,3 +84,15 @@ errno_h signal_h ) + +add_entrypoint_object( + signal + SRCS + signal.cpp + HDRS + signal.h + ../signal.h + DEPENDS + sigaction + signal_h +) diff --git a/libc/src/signal/linux/signal.cpp b/libc/src/signal/linux/signal.cpp new file mode 100644 --- /dev/null +++ b/libc/src/signal/linux/signal.cpp @@ -0,0 +1,26 @@ +//===------------------ Linux implementation of signal --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define __LLVM_LIBC_INTERNAL_SIGACTION +#include "src/signal/signal.h" +#include "src/signal/sigaction.h" + +#include "src/__support/common.h" + +namespace __llvm_libc { + +sighandler_t LLVM_LIBC_ENTRYPOINT(signal)(int signum, sighandler_t handler) { + struct __sigaction action, old; + action.sa_handler = handler; + action.sa_flags = SA_RESTART; + // Errno will already be set so no need to worry about changing errno here. + return __llvm_libc::sigaction(signum, &action, &old) == -1 ? SIG_ERR + : old.sa_handler; +} + +} // namespace __llvm_libc diff --git a/libc/src/signal/signal.h b/libc/src/signal/signal.h new file mode 100644 --- /dev/null +++ b/libc/src/signal/signal.h @@ -0,0 +1,22 @@ +//===------------- Implementation header for signal ------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SIGNAL_SIGNAL_H
+#define LLVM_LIBC_SRC_SIGNAL_SIGNAL_H
+
+#include "include/signal.h"
+
+namespace __llvm_libc {
+
+using sighandler_t = __sighandler_t;
+
+sighandler_t signal(int signum, sighandler_t handler);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SIGNAL_SIGNAL_H
diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt
--- a/libc/test/src/signal/CMakeLists.txt
+++ b/libc/test/src/signal/CMakeLists.txt
@@ -52,3 +52,18 @@
     signal_h
     __errno_location
 )
+
+add_libc_unittest(
+  signal_test
+  SUITE
+    libc_signal_unittests
+  SRCS
+    signal_test.cpp
+  DEPENDS
+    signal
+    signal_h
+    sigaction
+    raise
+    __errno_location
+    errno_h
+)
diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/signal/signal_test.cpp
@@ -0,0 +1,41 @@
+//===------------------------ Unittests for signal ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/errno.h"
+#include "include/signal.h"
+#include "src/errno/llvmlibc_errno.h"
+#include "src/signal/raise.h"
+#include "src/signal/signal.h"
+
+#include "utils/UnitTest/ErrnoSetterMatcher.h"
+#include "utils/UnitTest/Test.h"
+
+using __llvm_libc::testing::ErrnoSetterMatcher::Fails;
+using __llvm_libc::testing::ErrnoSetterMatcher::Succeeds;
+
+TEST(Signal, Invalid) {
+  llvmlibc_errno = 0;
+  __llvm_libc::sighandler_t valid = +[](int) {};
+  EXPECT_THAT((void *)__llvm_libc::signal(0, valid),
+              Fails(EINVAL, (void *)SIG_ERR));
+  EXPECT_THAT((void *)__llvm_libc::signal(65, valid),
+              Fails(EINVAL, (void *)SIG_ERR));
+}
+
+static int sum;
+TEST(Signal, Basic) {
+  // In case the test gets run multiple times.
+ sum = 0; + ASSERT_NE(__llvm_libc::signal(SIGUSR1, +[](int) { sum++; }), + SIG_ERR); + ASSERT_THAT(__llvm_libc::raise(SIGUSR1), Succeeds()); + EXPECT_EQ(sum, 1); + for (int i = 0; i < 10; i++) + ASSERT_THAT(__llvm_libc::raise(SIGUSR1), Succeeds()); + EXPECT_EQ(sum, 11); +} diff --git a/libc/utils/UnitTest/Test.h b/libc/utils/UnitTest/Test.h --- a/libc/utils/UnitTest/Test.h +++ b/libc/utils/UnitTest/Test.h @@ -249,13 +249,20 @@ #define UNIQUE_VAR(prefix) __CAT(prefix, __LINE__) #define EXPECT_THAT(MATCH, MATCHER) \ - auto UNIQUE_VAR(__matcher) = (MATCHER); \ - __llvm_libc::testing::Test::testMatch( \ - Ctx, UNIQUE_VAR(__matcher).match((MATCH)), UNIQUE_VAR(__matcher), \ - #MATCH, #MATCHER, __FILE__, __LINE__) + do { \ + auto UNIQUE_VAR(__matcher) = (MATCHER); \ + __llvm_libc::testing::Test::testMatch( \ + Ctx, UNIQUE_VAR(__matcher).match((MATCH)), UNIQUE_VAR(__matcher), \ + #MATCH, #MATCHER, __FILE__, __LINE__); \ + } while (0) #define ASSERT_THAT(MATCH, MATCHER) \ - if (!EXPECT_THAT(MATCH, MATCHER)) \ - return + do { \ + auto UNIQUE_VAR(__matcher) = (MATCHER); \ + if (!__llvm_libc::testing::Test::testMatch( \ + Ctx, UNIQUE_VAR(__matcher).match((MATCH)), UNIQUE_VAR(__matcher), \ + #MATCH, #MATCHER, __FILE__, __LINE__)) \ + return; \ + } while (0) #endif // LLVM_LIBC_UTILS_UNITTEST_H diff --git a/libc/utils/UnitTest/Test.cpp b/libc/utils/UnitTest/Test.cpp --- a/libc/utils/UnitTest/Test.cpp +++ b/libc/utils/UnitTest/Test.cpp @@ -261,7 +261,7 @@ if (Result.timedOut()) { Ctx.markFail(); llvm::outs() << File << ":" << Line << ": FAILURE\n" - << "Process timed out after " << 500 << " miliseconds.\n"; + << "Process timed out after " << 500 << " milliseconds.\n"; return false; } @@ -305,7 +305,7 @@ if (Result.timedOut()) { Ctx.markFail(); llvm::outs() << File << ":" << Line << ": FAILURE\n" - << "Process timed out after " << 500 << " miliseconds.\n"; + << "Process timed out after " << 500 << " milliseconds.\n"; return false; } diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -19,7 +19,7 @@ Please see the `Lit Command Guide`_ for more information about LIT. -.. _LIT Command Guide: http://llvm.org/docs/CommandGuide/lit.html +.. _LIT Command Guide: https://llvm.org/docs/CommandGuide/lit.html Setting up the Environment -------------------------- diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst --- a/libcxx/docs/UsingLibcxx.rst +++ b/libcxx/docs/UsingLibcxx.rst @@ -180,7 +180,7 @@ Since libc++ 4.0 this extension has been disabled by default. This macro may be defined to re-enable it in order to support existing code that depends on the extension. New use of this extension should be discouraged. - See `PR 27374 `_ for more information. + See `PR 27374 `_ for more information. Note: The "reduced-arity-initialization" extension is still offered but only for explicit conversions. Example: diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -161,8 +161,8 @@ Getting Involved ================ -First please review our `Developer's Policy `__ -and `Getting started with LLVM `__. +First please review our `Developer's Policy `__ +and `Getting started with LLVM `__. **Bug Reports** @@ -173,7 +173,7 @@ **Patches** If you want to contribute a patch to libc++, the best place for that is -`Phabricator `_. Please add `libcxx-commits` as a subscriber. +`Phabricator `_. Please add `libcxx-commits` as a subscriber. 
Also make sure you are subscribed to the `libcxx-commits mailing list `_. **Discussion and Questions** @@ -185,7 +185,7 @@ Quick Links =========== -* `LLVM Homepage `_ +* `LLVM Homepage `_ * `libc++abi Homepage `_ * `LLVM Bugzilla `_ * `libcxx-commits Mailing List`_ diff --git a/libunwind/docs/BuildingLibunwind.rst b/libunwind/docs/BuildingLibunwind.rst --- a/libunwind/docs/BuildingLibunwind.rst +++ b/libunwind/docs/BuildingLibunwind.rst @@ -57,8 +57,8 @@ $ cd where-you-want-libunwind-to-live $ # Check out llvm, and libunwind - $ ``svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm`` - $ ``svn co http://llvm.org/svn/llvm-project/libunwind/trunk libunwind`` + $ ``svn co https://llvm.org/svn/llvm-project/llvm/trunk llvm`` + $ ``svn co https://llvm.org/svn/llvm-project/libunwind/trunk libunwind`` $ cd where-you-want-to-build $ mkdir build && cd build $ export CC=clang CXX=clang++ diff --git a/libunwind/docs/index.rst b/libunwind/docs/index.rst --- a/libunwind/docs/index.rst +++ b/libunwind/docs/index.rst @@ -71,8 +71,8 @@ Getting Involved ================ -First please review our `Developer's Policy `__ -and `Getting started with LLVM `__. +First please review our `Developer's Policy `__ +and `Getting started with LLVM `__. **Bug Reports** @@ -84,7 +84,7 @@ **Patches** If you want to contribute a patch to libunwind, the best place for that is -`Phabricator `_. Please include [libunwind] in the subject and +`Phabricator `_. Please include [libunwind] in the subject and add `cfe-commits` as a subscriber. Also make sure you are subscribed to the `cfe-commits mailing list `_. @@ -97,7 +97,7 @@ Quick Links =========== -* `LLVM Homepage `_ +* `LLVM Homepage `_ * `LLVM Bugzilla `_ * `cfe-commits Mailing List`_ * `cfe-dev Mailing List`_ diff --git a/lld/docs/AtomLLD.rst b/lld/docs/AtomLLD.rst --- a/lld/docs/AtomLLD.rst +++ b/lld/docs/AtomLLD.rst @@ -59,4 +59,4 @@ * :ref:`genindex` * :ref:`search` -__ http://llvm.org/docs/DeveloperPolicy.html#license +__ https://llvm.org/docs/DeveloperPolicy.html#license diff --git a/lld/docs/NewLLD.rst b/lld/docs/NewLLD.rst --- a/lld/docs/NewLLD.rst +++ b/lld/docs/NewLLD.rst @@ -248,7 +248,7 @@ so that they are linked as if they were in the native format from the beginning. The details are described in this document. -http://llvm.org/docs/LinkTimeOptimization.html +https://llvm.org/docs/LinkTimeOptimization.html Glossary -------- diff --git a/lld/docs/design.rst b/lld/docs/design.rst --- a/lld/docs/design.rst +++ b/lld/docs/design.rst @@ -326,7 +326,7 @@ The lld project contains a test suite which is being built up as new code is added to lld. All new lld functionality should have a tests added to the test -suite. The test suite is `lit `_ driven. Each +suite. The test suite is `lit `_ driven. Each test is a text file with comments telling lit how to run the test and check the result To facilitate testing, the lld project builds a tool called lld-core. This tool reads a YAML file (default from stdin), parses it into one or more diff --git a/lld/docs/development.rst b/lld/docs/development.rst --- a/lld/docs/development.rst +++ b/lld/docs/development.rst @@ -6,7 +6,7 @@ Note: this document discuss Mach-O port of LLD. For ELF and COFF, see :doc:`index`. -lld is developed as part of the `LLVM `_ project. +lld is developed as part of the `LLVM `_ project. 
Creating a Reader ----------------- diff --git a/lld/docs/getting_started.rst b/lld/docs/getting_started.rst --- a/lld/docs/getting_started.rst +++ b/lld/docs/getting_started.rst @@ -6,7 +6,7 @@ This page gives you the shortest path to checking out and building lld. If you run into problems, please file bugs in the `LLVM Bugzilla`__ -__ http://llvm.org/bugs/ +__ https://bugs.llvm.org/ Building lld ------------ @@ -84,4 +84,4 @@ For more information on using CMake see the `LLVM CMake guide`_. -.. _LLVM CMake guide: http://llvm.org/docs/CMake.html +.. _LLVM CMake guide: https://llvm.org/docs/CMake.html diff --git a/lld/docs/index.rst b/lld/docs/index.rst --- a/lld/docs/index.rst +++ b/lld/docs/index.rst @@ -98,7 +98,7 @@ If you have already checked out LLVM using SVN, you can check out LLD under ``tools`` directory just like you probably did for clang. For the details, see `Getting Started with the LLVM System -`_. +`_. If you haven't checked out LLVM, the easiest way to build LLD is to check out the entire LLVM projects/sub-projects from a git mirror and diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -9977,4 +9977,4 @@ .. [SEMVER] `Semantic Versioning `__ .. [OpenCL] `The OpenCL Specification Version 2.0 `__ .. [HRF] `Heterogeneous-race-free Memory Models `__ -.. [CLANG-ATTR] `Attributes in Clang `__ +.. [CLANG-ATTR] `Attributes in Clang `__ diff --git a/llvm/docs/AliasAnalysis.rst b/llvm/docs/AliasAnalysis.rst --- a/llvm/docs/AliasAnalysis.rst +++ b/llvm/docs/AliasAnalysis.rst @@ -19,7 +19,7 @@ same object, or are known to never point to the same object. The LLVM `AliasAnalysis -`__ class is the +`__ class is the primary interface used by clients and implementations of alias analyses in the LLVM system. This class is the common interface between clients of alias analysis information and the implementations providing it, and is designed to @@ -36,7 +36,7 @@ ``AliasAnalysis`` Class Overview ================================ -The `AliasAnalysis `__ +The `AliasAnalysis `__ class defines the interface that the various alias analysis implementations should support. This class exports two important enums: ``AliasResult`` and ``ModRefResult`` which represent the result of an alias query or a mod/ref @@ -264,7 +264,7 @@ --------------------------------- All of the `AliasAnalysis -`__ virtual methods +`__ virtual methods default to providing :ref:`chaining ` to another alias analysis implementation, which ends up returning conservatively correct information (returning "May" Alias and "Mod/Ref" for alias and mod/ref queries @@ -435,7 +435,7 @@ Many transformations need information about alias **sets** that are active in some scope, rather than information about pairwise aliasing. The -`AliasSetTracker `__ +`AliasSetTracker `__ class is used to efficiently build these Alias Sets from the pairwise alias analysis information provided by the ``AliasAnalysis`` interface. diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -601,7 +601,7 @@ **LLVM_BUILD_INSTRUMENTED_COVERAGE**:BOOL If enabled, `source-based code coverage - `_ instrumentation + `_ instrumentation is enabled while building llvm. **LLVM_CCACHE_BUILD**:BOOL diff --git a/llvm/docs/CommandGuide/llvm-lipo.rst b/llvm/docs/CommandGuide/llvm-lipo.rst --- a/llvm/docs/CommandGuide/llvm-lipo.rst +++ b/llvm/docs/CommandGuide/llvm-lipo.rst @@ -70,4 +70,4 @@ BUGS ---- -To report bugs, please visit . 
+To report bugs, please visit . diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -545,7 +545,7 @@ BUGS ---- -To report bugs, please visit . +To report bugs, please visit . There is a known issue with :option:`--input-target` and :option:`--target` causing only ``binary`` and ``ihex`` formats to have any effect. Other values diff --git a/llvm/docs/CommandGuide/llvm-objdump.rst b/llvm/docs/CommandGuide/llvm-objdump.rst --- a/llvm/docs/CommandGuide/llvm-objdump.rst +++ b/llvm/docs/CommandGuide/llvm-objdump.rst @@ -324,7 +324,7 @@ BUGS ---- -To report bugs, please visit . +To report bugs, please visit . SEE ALSO -------- diff --git a/llvm/docs/CommandGuide/llvm-size.rst b/llvm/docs/CommandGuide/llvm-size.rst --- a/llvm/docs/CommandGuide/llvm-size.rst +++ b/llvm/docs/CommandGuide/llvm-size.rst @@ -195,4 +195,4 @@ BUGS ---- -To report bugs, please visit . +To report bugs, please visit . diff --git a/llvm/docs/CommandGuide/llvm-strings.rst b/llvm/docs/CommandGuide/llvm-strings.rst --- a/llvm/docs/CommandGuide/llvm-strings.rst +++ b/llvm/docs/CommandGuide/llvm-strings.rst @@ -127,4 +127,4 @@ BUGS ---- -To report bugs, please visit . +To report bugs, please visit . diff --git a/llvm/docs/CommandGuide/llvm-strip.rst b/llvm/docs/CommandGuide/llvm-strip.rst --- a/llvm/docs/CommandGuide/llvm-strip.rst +++ b/llvm/docs/CommandGuide/llvm-strip.rst @@ -190,7 +190,7 @@ BUGS ---- -To report bugs, please visit . +To report bugs, please visit . SEE ALSO -------- diff --git a/llvm/docs/CompileCudaWithLLVM.rst b/llvm/docs/CompileCudaWithLLVM.rst --- a/llvm/docs/CompileCudaWithLLVM.rst +++ b/llvm/docs/CompileCudaWithLLVM.rst @@ -30,7 +30,7 @@ `NVIDIA's CUDA installation guide `_ for details. Note that clang `maynot support -`_ the CUDA toolkit as installed by +`_ the CUDA toolkit as installed by some Linux package managers. Clang does attempt to deal with specific details of CUDA installation on a handful of common Linux distributions, but in general the most reliable way to make it work is to install CUDA in a single directory from @@ -342,7 +342,7 @@ When resolving an overloaded function, clang considers the host/device attributes of the caller and callee. These are used as a tiebreaker during overload resolution. See `IdentifyCUDAPreference -`_ for the full set of rules, +`_ for the full set of rules, but at a high level they are: * D functions prefer to call other Ds. HDs are given lower priority. @@ -507,12 +507,12 @@ reduce redundancy within straight-line code. * `Aggressive speculative execution - `_ + `_ -- This is mainly for promoting straight-line scalar optimizations, which are most effective on code along dominator paths. * `Memory space inference - `_ -- + `_ -- In PTX, we can operate on pointers that are in a particular "address space" (global, shared, constant, or local), or we can operate on pointers in the "generic" address space, which can point to anything. Operations in a @@ -521,7 +521,7 @@ possible. * `Bypassing 64-bit divides - `_ -- + `_ -- This was an existing optimization that we enabled for the PTX backend. 64-bit integer divides are much slower than 32-bit ones on NVIDIA GPUs. @@ -536,7 +536,7 @@ SROA, which sometimes speed up code by over 10x. (Programmers can force unrolling and inline using clang's `loop unrolling pragmas - `_ + `_ and ``__attribute__((always_inline))``.) 
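A minimal sketch of what forcing those two optimizations looks like in CUDA device code (the function and its name are purely illustrative; only ``#pragma unroll`` and ``__attribute__((always_inline))`` come from the text above):

.. code-block:: c++

  // Illustrative sketch: fully unroll a small loop and force inlining.
  __attribute__((always_inline)) __device__ float dot3(const float *a,
                                                        const float *b) {
    float s = 0.0f;
  #pragma unroll
    for (int i = 0; i < 3; ++i)
      s += a[i] * b[i];
    return s;
  }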
Publication @@ -558,4 +558,4 @@ ============== To obtain help on LLVM in general and its CUDA support, see `the LLVM -community `_. +community `_. diff --git a/llvm/docs/Docker.rst b/llvm/docs/Docker.rst --- a/llvm/docs/Docker.rst +++ b/llvm/docs/Docker.rst @@ -51,7 +51,7 @@ The ``llvm/utils/docker`` folder contains Dockerfiles and simple bash scripts to serve as a basis for anyone who wants to create their own Docker image with LLVM components, compiled from sources. The sources are checked out from the -upstream svn repository when building the image. +upstream git repository when building the image. The resulting image contains only the requested LLVM components and a few extra packages to make the image minimally useful for C++ development, e.g. libstdc++ @@ -68,7 +68,7 @@ ===== The ``llvm/utils/build_docker_image.sh`` script provides a rather high degree of control on how to run the build. It allows you to specify the projects to -checkout from svn and provide a list of CMake arguments to use during when +checkout from git and provide a list of CMake arguments to use during when building LLVM inside docker container. Here's a very simple example of getting a docker image with clang binary, diff --git a/llvm/docs/FAQ.rst b/llvm/docs/FAQ.rst --- a/llvm/docs/FAQ.rst +++ b/llvm/docs/FAQ.rst @@ -12,8 +12,8 @@ Can I modify LLVM source code and redistribute the modified source? ------------------------------------------------------------------- Yes. The modified source distribution must retain the copyright notice and -follow the conditions listed in the `LLVM license -`_. +follow the conditions listed in the `Apache License v2.0 with LLVM Exceptions +`_. Can I modify the LLVM source code and redistribute binaries or other tools based on it, without redistributing the source? @@ -72,9 +72,9 @@ ------------------------------------ LLVM currently has full support for C and C++ source languages through -`Clang `_. Many other language frontends have +`Clang `_. Many other language frontends have been written using LLVM, and an incomplete list is available at -`projects with LLVM `_. +`projects with LLVM `_. I'd like to write a self-hosting LLVM compiler. How should I interface with the LLVM middle-end optimizers and back-end code generators? diff --git a/llvm/docs/Frontend/PerformanceTips.rst b/llvm/docs/Frontend/PerformanceTips.rst --- a/llvm/docs/Frontend/PerformanceTips.rst +++ b/llvm/docs/Frontend/PerformanceTips.rst @@ -27,7 +27,7 @@ to model and see what decisions Clang's IRGen makes about what IR to emit. Studying Clang's CodeGen directory can also be a good source of ideas. Note that Clang and LLVM are explicitly version locked so you'll need to make sure -you're using a Clang built from the same svn revision or release as the LLVM +you're using a Clang built from the same git revision or release as the LLVM library you're using. As always, it's *strongly* recommended that you track tip of tree development, particularly during bring up of a new project. diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst --- a/llvm/docs/GettingStarted.rst +++ b/llvm/docs/GettingStarted.rst @@ -16,7 +16,7 @@ object files. Tools include an assembler, disassembler, bitcode analyzer, and bitcode optimizer. It also contains basic regression tests. -C-like languages use the `Clang `_ front end. This +C-like languages use the `Clang `_ front end. This component compiles C, C++, Objective C, and Objective C++ code into LLVM bitcode -- and from there into object files, using LLVM. 
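As a minimal sketch of that pipeline (the file names are hypothetical; ``clang -emit-llvm`` and ``llc`` are the standard tools involved):

.. code-block:: console

  $ clang -O2 -emit-llvm -c hello.c -o hello.bc   # C source -> LLVM bitcode
  $ llc -filetype=obj hello.bc -o hello.o         # bitcode -> native object file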
@@ -28,7 +28,7 @@ ========================================= The LLVM Getting Started documentation may be out of date. The `Clang -Getting Started `_ page might have more +Getting Started `_ page might have more accurate information. This is an example workflow and configuration to get and build the LLVM source: @@ -522,7 +522,7 @@ do it like so: * ``cd where-you-want-llvm-to-live`` -* Read-Only: ``svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm`` +* Read-Only: ``svn co https://llvm.org/svn/llvm-project/llvm/trunk llvm`` * Read-Write: ``svn co https://user@llvm.org/svn/llvm-project/llvm/trunk llvm`` This will create an '``llvm``' directory in the current directory and fully @@ -722,7 +722,7 @@ iOS due to limitations in the iOS SDK. Check :doc:`HowToCrossCompileLLVM` and `Clang docs on how to cross-compile in general -`_ for more information +`_ for more information about cross-compiling. The Location of LLVM Object Files @@ -789,7 +789,7 @@ One useful source of information about the LLVM source base is the LLVM `doxygen `_ documentation available at -``_. The following is a brief introduction to code +``_. The following is a brief introduction to code layout: ``llvm/examples`` @@ -1105,8 +1105,8 @@ that aren't documented here (but we'll gladly accept a patch if you want to write something up!). For more information about LLVM, check out: -* `LLVM Homepage `_ -* `LLVM Doxygen Tree `_ -* `Starting a Project that Uses LLVM `_ +* `LLVM Homepage `_ +* `LLVM Doxygen Tree `_ +* `Starting a Project that Uses LLVM `_ .. _installing arcanist: https://secure.phabricator.com/book/phabricator/article/arcanist_quick_start/ diff --git a/llvm/docs/GettingStartedVS.rst b/llvm/docs/GettingStartedVS.rst --- a/llvm/docs/GettingStartedVS.rst +++ b/llvm/docs/GettingStartedVS.rst @@ -18,7 +18,7 @@ bitcode optimizer. It also contains basic regression tests that can be used to test the LLVM tools and the Clang front end. -The second piece is the `Clang `_ front end. This +The second piece is the `Clang `_ front end. This component compiles C, C++, Objective C, and Objective C++ code into LLVM bitcode. Clang typically uses LLVM libraries to optimize the bitcode and emit machine code. LLVM fully supports the COFF object file format, which is @@ -74,15 +74,12 @@ (*or use WinZip*) 3. ``cd llvm`` - * With anonymous Subversion access: + * With git access: - *Note:* some regression tests require Unix-style line ending (``\n``). To - pass all regression tests, please add two lines *enable-auto-props = yes* - and *\* = svn:mime-type=application/octet-stream* to - ``C:\Users\\AppData\Roaming\Subversion\config``. + *Note:* some regression tests require Unix-style line ending (``\n``). 1. ``cd `` - 2. ``svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm`` + 2. ``git clone https://github.com/llvm/llvm-project.git llvm`` 3. ``cd llvm`` 5. Use `CMake `_ to generate up-to-date project files: @@ -103,7 +100,7 @@ * See the :doc:`LLVM CMake guide ` for detailed information about how to configure the LLVM build. * CMake generates project files for all build types. To select a specific - build type, use the Configuration manager from the VS IDE or the + build type, use the Configuration manager from the VS IDE or the ``/property:Configuration`` command line option when using MSBuild. * By default, the Visual Studio project files generated by CMake use the 32-bit toolset. 
If you are developing on a 64-bit version of Windows and @@ -236,6 +233,6 @@ do that aren't documented here (but we'll gladly accept a patch if you want to write something up!). For more information about LLVM, check out: -* `LLVM homepage `_ -* `LLVM doxygen tree `_ +* `LLVM homepage `_ +* `LLVM doxygen tree `_ diff --git a/llvm/docs/GlobalISel/GMIR.rst b/llvm/docs/GlobalISel/GMIR.rst --- a/llvm/docs/GlobalISel/GMIR.rst +++ b/llvm/docs/GlobalISel/GMIR.rst @@ -76,7 +76,7 @@ way of getting a given operand's type (as there was no 1:1 mapping between instruction types and operands). We considered putting the type in some variant of MCInstrDesc instead: - See `PR26576 `_: [GlobalISel] Generic MachineInstrs + See `PR26576 `_: [GlobalISel] Generic MachineInstrs need a type but this increases the memory footprint of the related objects .. _gmir-regbank: diff --git a/llvm/docs/GlobalISel/IRTranslator.rst b/llvm/docs/GlobalISel/IRTranslator.rst --- a/llvm/docs/GlobalISel/IRTranslator.rst +++ b/llvm/docs/GlobalISel/IRTranslator.rst @@ -73,7 +73,7 @@ As some of the bits are undef (padding), we should consider augmenting the representation with additional metadata (in effect, caching computeKnownBits information on vregs). -See `PR26161 `_: [GlobalISel] Value to vreg during +See `PR26161 `_: [GlobalISel] Value to vreg during IR to MachineInstr translation for aggregate type .. _irtranslator-constants: diff --git a/llvm/docs/GlobalISel/KnownBits.rst b/llvm/docs/GlobalISel/KnownBits.rst --- a/llvm/docs/GlobalISel/KnownBits.rst +++ b/llvm/docs/GlobalISel/KnownBits.rst @@ -97,4 +97,4 @@ } There are many more API's beyond ``getKnownBits()``. See the `API reference -`_ for more information +`_ for more information diff --git a/llvm/docs/HistoricalNotes/2007-OriginalClangReadme.txt b/llvm/docs/HistoricalNotes/2007-OriginalClangReadme.txt --- a/llvm/docs/HistoricalNotes/2007-OriginalClangReadme.txt +++ b/llvm/docs/HistoricalNotes/2007-OriginalClangReadme.txt @@ -125,7 +125,7 @@ invoking Graphviz. For more information on getting Graphviz to work with clang/LLVM, - see: http://llvm.org/docs/ProgrammersManual.html#ViewGraph + see: https://llvm.org/docs/ProgrammersManual.html#ViewGraph III. Current advantages over GCC: diff --git a/llvm/docs/HowToCrossCompileLLVM.rst b/llvm/docs/HowToCrossCompileLLVM.rst --- a/llvm/docs/HowToCrossCompileLLVM.rst +++ b/llvm/docs/HowToCrossCompileLLVM.rst @@ -9,7 +9,7 @@ Clang on host machine, targeting another platform. For more information on how to use Clang as a cross-compiler, -please check http://clang.llvm.org/docs/CrossCompilation.html. +please check https://clang.llvm.org/docs/CrossCompilation.html. TODO: Add MIPS and other platforms to this document. @@ -189,7 +189,7 @@ If you copy that tarball to your target board, you'll be able to use it for running the test-suite, for example. Follow the guidelines at -http://llvm.org/docs/lnt/quickstart.html, unpack the tarball in the +https://llvm.org/docs/lnt/quickstart.html, unpack the tarball in the test directory, and use options: .. code-block:: bash diff --git a/llvm/docs/HowToSetUpLLVMStyleRTTI.rst b/llvm/docs/HowToSetUpLLVMStyleRTTI.rst --- a/llvm/docs/HowToSetUpLLVMStyleRTTI.rst +++ b/llvm/docs/HowToSetUpLLVMStyleRTTI.rst @@ -380,8 +380,8 @@ For example, LLVM-style RTTI can work fine in the presence of multiple-inheritance by defining an appropriate ``classof``. An example of this in practice is -`Decl `_ vs. -`DeclContext `_ +`Decl `_ vs. +`DeclContext `_ inside Clang. 
The ``Decl`` hierarchy is done very similarly to the example setup demonstrated in this tutorial. @@ -396,7 +396,7 @@ Touch on some of the more advanced features, like ``isa_impl`` and ``simplify_type``. However, those two need reference documentation in the form of doxygen comments as well. We need the doxygen so that we can - say "for full details, see http://llvm.org/doxygen/..." + say "for full details, see https://llvm.org/doxygen/..." Rules of Thumb ============== diff --git a/llvm/docs/HowToSubmitABug.rst b/llvm/docs/HowToSubmitABug.rst --- a/llvm/docs/HowToSubmitABug.rst +++ b/llvm/docs/HowToSubmitABug.rst @@ -26,7 +26,7 @@ * All information necessary to reproduce the problem. * The reduced test-case that triggers the bug. -* The location where you obtained LLVM (if not from our Subversion +* The location where you obtained LLVM (if not from our Git repository). Thanks for helping us make LLVM better! diff --git a/llvm/docs/LLVMBuild.txt b/llvm/docs/LLVMBuild.txt --- a/llvm/docs/LLVMBuild.txt +++ b/llvm/docs/LLVMBuild.txt @@ -10,7 +10,7 @@ ; ; For more information on the LLVMBuild system, please see: ; -; http://llvm.org/docs/LLVMBuild.html +; https://llvm.org/docs/LLVMBuild.html ; ;===------------------------------------------------------------------------===; diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1692,7 +1692,7 @@ functions. ``safestack`` This attribute indicates that - `SafeStack `_ + `SafeStack `_ protection is enabled for this function. If a function that has a ``safestack`` attribute is inlined into a @@ -6690,7 +6690,7 @@ ^^^^^^^^^^ The optional ``TypeIdInfo`` field, used for -`Control Flow Integrity `_, +`Control Flow Integrity `_, looks like: .. code-block:: text @@ -6767,7 +6767,7 @@ Each type id summary entry corresponds to a type identifier resolution which is generated during the LTO link portion of the compile when building -with `Control Flow Integrity `_, +with `Control Flow Integrity `_, so these are only present in a combined summary index. Example: diff --git a/llvm/docs/Lexicon.rst b/llvm/docs/Lexicon.rst --- a/llvm/docs/Lexicon.rst +++ b/llvm/docs/Lexicon.rst @@ -112,7 +112,7 @@ **GEP** ``GetElementPtr``. An LLVM IR instruction that is used to get the address of a subelement of an aggregate data structure. It is documented in detail - `here `_. + `here `_. **GVN** Global Value Numbering. GVN is a pass that partitions values computed by a diff --git a/llvm/docs/LibFuzzer.rst b/llvm/docs/LibFuzzer.rst --- a/llvm/docs/LibFuzzer.rst +++ b/llvm/docs/LibFuzzer.rst @@ -581,7 +581,7 @@ One easy to use metric is, of course, code coverage. We recommend to use -`Clang Coverage `_, +`Clang Coverage `_, to visualize and study your code coverage (`example `_). @@ -768,7 +768,7 @@ * WOFF2: `[1] `__ -* LLVM: `Clang `_, `Clang-format `_, `libc++ `_, `llvm-as `_, `Demangler `_, Disassembler: http://reviews.llvm.org/rL247405, http://reviews.llvm.org/rL247414, http://reviews.llvm.org/rL247416, http://reviews.llvm.org/rL247417, http://reviews.llvm.org/rL247420, http://reviews.llvm.org/rL247422. +* LLVM: `Clang `_, `Clang-format `_, `libc++ `_, `llvm-as `_, `Demangler `_, Disassembler: http://reviews.llvm.org/rL247405, http://reviews.llvm.org/rL247414, http://reviews.llvm.org/rL247416, http://reviews.llvm.org/rL247417, http://reviews.llvm.org/rL247420, http://reviews.llvm.org/rL247422. * Tensorflow: `[1] `__ @@ -781,18 +781,18 @@ .. _pcre2: http://www.pcre.org/ .. 
_AFL: http://lcamtuf.coredump.cx/afl/ .. _Radamsa: https://github.com/aoh/radamsa -.. _SanitizerCoverage: http://clang.llvm.org/docs/SanitizerCoverage.html -.. _SanitizerCoverageTraceDataFlow: http://clang.llvm.org/docs/SanitizerCoverage.html#tracing-data-flow -.. _AddressSanitizer: http://clang.llvm.org/docs/AddressSanitizer.html -.. _LeakSanitizer: http://clang.llvm.org/docs/LeakSanitizer.html +.. _SanitizerCoverage: https://clang.llvm.org/docs/SanitizerCoverage.html +.. _SanitizerCoverageTraceDataFlow: https://clang.llvm.org/docs/SanitizerCoverage.html#tracing-data-flow +.. _AddressSanitizer: https://clang.llvm.org/docs/AddressSanitizer.html +.. _LeakSanitizer: https://clang.llvm.org/docs/LeakSanitizer.html .. _Heartbleed: http://en.wikipedia.org/wiki/Heartbleed .. _FuzzerInterface.h: https://github.com/llvm/llvm-project/blob/master/compiler-rt/lib/fuzzer/FuzzerInterface.h -.. _3.7.0: http://llvm.org/releases/3.7.0/docs/LibFuzzer.html -.. _building Clang from trunk: http://clang.llvm.org/get_started.html -.. _MemorySanitizer: http://clang.llvm.org/docs/MemorySanitizer.html -.. _UndefinedBehaviorSanitizer: http://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html -.. _`coverage counters`: http://clang.llvm.org/docs/SanitizerCoverage.html#coverage-counters +.. _3.7.0: https://llvm.org/releases/3.7.0/docs/LibFuzzer.html +.. _building Clang from trunk: https://clang.llvm.org/get_started.html +.. _MemorySanitizer: https://clang.llvm.org/docs/MemorySanitizer.html +.. _UndefinedBehaviorSanitizer: https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html +.. _`coverage counters`: https://clang.llvm.org/docs/SanitizerCoverage.html#coverage-counters .. _`value profile`: #value-profile -.. _`caller-callee pairs`: http://clang.llvm.org/docs/SanitizerCoverage.html#caller-callee-coverage +.. _`caller-callee pairs`: https://clang.llvm.org/docs/SanitizerCoverage.html#caller-callee-coverage .. _BoringSSL: https://boringssl.googlesource.com/boringssl/ diff --git a/llvm/docs/LoopTerminology.rst b/llvm/docs/LoopTerminology.rst --- a/llvm/docs/LoopTerminology.rst +++ b/llvm/docs/LoopTerminology.rst @@ -152,7 +152,7 @@ (:ref:`-loop-simplify `) pass and is automatically added by the pass managers when scheduling a LoopPass. This pass is implemented in -`LoopSimplify.h `_. +`LoopSimplify.h `_. When it is successful, the loop has: * A preheader. @@ -178,7 +178,7 @@ Loops are rotated by the LoopRotate (:ref:`loop-rotate `) pass, which converts loops into do/while style loops and is implemented in -`LoopRotation.h `_. Example: +`LoopRotation.h `_. Example: .. code-block:: C diff --git a/llvm/docs/MarkdownQuickstartTemplate.md b/llvm/docs/MarkdownQuickstartTemplate.md --- a/llvm/docs/MarkdownQuickstartTemplate.md +++ b/llvm/docs/MarkdownQuickstartTemplate.md @@ -64,7 +64,7 @@ ### Example Subsection -Make a link [like this](http://llvm.org/). There is also a more +Make a link [like this](https://llvm.org/). There is also a more sophisticated syntax which [can be more readable] for longer links since it disrupts the flow less. You can put the `[link name]: ` block pretty much anywhere later in the document. diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst --- a/llvm/docs/MergeFunctions.rst +++ b/llvm/docs/MergeFunctions.rst @@ -39,16 +39,16 @@ `Single Static Assignment `_ concept and has an understanding of -`IR structure `_. +`IR structure `_. 
We will use terms such as -"`module `_", -"`function `_", +"`module `_", +"`function `_", "`basic block `_", -"`user `_", -"`value `_", +"`user `_", +"`value `_", "`instruction -`_". +`_". As a good starting point, the Kaleidoscope tutorial can be used: diff --git a/llvm/docs/Packaging.rst b/llvm/docs/Packaging.rst --- a/llvm/docs/Packaging.rst +++ b/llvm/docs/Packaging.rst @@ -38,7 +38,7 @@ should turn it back on to let users debug their programs. ``--enable-optimized`` - (For svn checkouts) Builds LLVM with ``-O2`` and, by default, turns off + (For git checkouts) Builds LLVM with ``-O2`` and, by default, turns off debug symbols. Also available by setting ``ENABLE_OPTIMIZED=0|1`` in ``make``'s environment. This defaults to enabled when not in a checkout. diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -24,7 +24,7 @@ that this manual is not intended to serve as a replacement for reading the source code, so if you think there should be a method in one of these classes to do something, but it's not listed, check the source. Links to the `doxygen -`__ sources are provided to make this as easy as +`__ sources are provided to make this as easy as possible. The first section of this document describes general information that is useful @@ -32,7 +32,7 @@ Core LLVM classes. In the future this manual will be extended with information describing how to use extension libraries, such as dominator information, CFG traversal routines, and useful utilities like the ``InstVisitor`` (`doxygen -`__) template. +`__) template. .. _general: @@ -108,7 +108,7 @@ ``dynamic_cast<>`` only works on classes that have a v-table). Because they are used so often, you must know what they do and how they work. All of these templates are defined in the ``llvm/Support/Casting.h`` (`doxygen -`__) file (note that you very +`__) file (note that you very rarely have to include this file directly). ``isa<>``: @@ -231,7 +231,7 @@ Similarly, APIs which need to return a string may return a ``StringRef`` instance, which can be used directly or converted to an ``std::string`` using the ``str`` member function. See ``llvm/ADT/StringRef.h`` (`doxygen -`__) for more +`__) for more information. You should rarely use the ``StringRef`` class directly, because it contains @@ -243,7 +243,7 @@ The ``Twine`` class ^^^^^^^^^^^^^^^^^^^ -The ``Twine`` (`doxygen `__) +The ``Twine`` (`doxygen `__) class is an efficient way for APIs to accept concatenated strings. For example, a common LLVM paradigm is to name one instruction based on the name of another instruction with a suffix, for example: @@ -261,7 +261,7 @@ rendered directly into a character array. This avoids unnecessary heap allocation involved in constructing the temporary results of string concatenation. See ``llvm/ADT/Twine.h`` (`doxygen -`__) and :ref:`here ` +`__) and :ref:`here ` for more information. As with a ``StringRef``, ``Twine`` objects point to external memory and should @@ -1056,7 +1056,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``function_ref`` -(`doxygen `__) class +(`doxygen `__) class template represents a reference to a callable object, templated over the type of the callable. This is a good choice for passing a callback to a function, if you don't need to hold onto the callback after the function returns. In this @@ -1106,7 +1106,7 @@ them out, allowing you to enable them if you need them in the future. 
The ``llvm/Support/Debug.h`` (`doxygen -`__) file provides a macro named +`__) file provides a macro named ``LLVM_DEBUG()`` that is a much nicer solution to this problem. Basically, you can put arbitrary code into the argument of the ``LLVM_DEBUG`` macro, and it is only executed if '``opt``' (or any other tool) is run with the '``-debug``' command @@ -1203,7 +1203,7 @@ ------------------------------------------- The ``llvm/ADT/Statistic.h`` (`doxygen -`__) file provides a class +`__) file provides a class named ``Statistic`` that is used as a unified way to keep track of what the LLVM compiler is doing and how effective various optimizations are. It is useful to see what optimizations are contributing to making a particular program run @@ -1298,7 +1298,7 @@ certain number of times. The ``llvm/Support/DebugCounter.h`` (`doxygen -`__) file +`__) file provides a class named ``DebugCounter`` that can be used to create command line counter options that control execution of parts of your code. @@ -2513,7 +2513,7 @@ ``BasicBlock``\ s and then that ``BasicBlock``'s ``Instruction``\ s, ``InstIterator`` should be used instead. You'll need to include ``llvm/IR/InstIterator.h`` (`doxygen -`__) and then instantiate +`__) and then instantiate ``InstIterator``\ s explicitly in your code. Here's a small example that shows how to dump all instructions in a function to the standard error stream: @@ -2664,7 +2664,7 @@ ``InvokeInst``\ s the same way, even though their most-specific common base class is ``Instruction``, which includes lots of less closely-related things. For these cases, LLVM provides a handy wrapper class called ``CallSite`` -(`doxygen `__) It is +(`doxygen `__) It is essentially a wrapper around an ``Instruction`` pointer, with some methods that provide functionality common to ``CallInst``\ s and ``InvokeInst``\ s. @@ -2680,7 +2680,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Frequently, we might have an instance of the ``Value`` class (`doxygen -`__) and we want to determine +`__) and we want to determine which ``User``\ s use the ``Value``. The list of all ``User``\ s of a particular ``Value`` is called a *def-use* chain. For example, let's say we have a ``Function*`` named ``F`` to a particular function ``foo``. Finding all of the @@ -2698,7 +2698,7 @@ } Alternatively, it's common to have an instance of the ``User`` Class (`doxygen -`__) and need to know what +`__) and need to know what ``Value``\ s are used by it. The list of all ``Value``\ s used by a ``User`` is known as a *use-def* chain. Instances of class ``Instruction`` are common ``User`` s, so we might want to iterate over all of the values that a particular @@ -2770,7 +2770,7 @@ integer in the current stack frame, at run time. Each ``Instruction`` subclass is likely to have varying default parameters which change the semantics of the instruction, so refer to the `doxygen documentation for the subclass of -Instruction `_ that +Instruction `_ that you're interested in instantiating. *Naming values* @@ -2928,7 +2928,7 @@ """"""""""""""""""""""""""""""""" Including "`llvm/Transforms/Utils/BasicBlockUtils.h -`_" permits use of two +`_" permits use of two very useful replace functions: ``ReplaceInstWithValue`` and ``ReplaceInstWithInst``. @@ -2974,8 +2974,8 @@ You can use ``Value::replaceAllUsesWith`` and ``User::replaceUsesOfWith`` to change more than one use at a time. 
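A minimal sketch of the common pattern (assuming ``OldInst`` and ``NewInst`` are ``Instruction*`` values producing the same type):

.. code-block:: c++

  // Rewrite every use of OldInst to refer to NewInst, then delete OldInst.
  OldInst->replaceAllUsesWith(NewInst);
  OldInst->eraseFromParent();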
See the doxygen documentation for the -`Value Class `_ and `User Class -`_, respectively, for more +`Value Class `_ and `User Class +`_, respectively, for more information. .. _schanges_deletingGV: @@ -3103,7 +3103,7 @@ ------------------------------ The ``ValueSymbolTable`` (`doxygen -`__) class provides +`__) class provides a symbol table that the :ref:`Function ` and Module_ classes use for naming value definitions. The symbol table can provide a name for any Value_. @@ -3124,10 +3124,10 @@ The ``User`` and owned ``Use`` classes' memory layout ----------------------------------------------------- -The ``User`` (`doxygen `__) +The ``User`` (`doxygen `__) class provides a basis for expressing the ownership of ``User`` towards other -`Value instance `_\ s. The -``Use`` (`doxygen `__) helper +`Value instance `_\ s. The +``Use`` (`doxygen `__) helper class is employed to do the bookkeeping and to facilitate *O(1)* addition and removal. @@ -3414,9 +3414,9 @@ ``#include "llvm/IR/Type.h"`` -header source: `Type.h `_ +header source: `Type.h `_ -doxygen info: `Type Classes `_ +doxygen info: `Type Classes `_ The Core LLVM classes are the primary means of representing the program being inspected or transformed. The core LLVM classes are defined in header files in @@ -3518,9 +3518,9 @@ ``#include "llvm/IR/Module.h"`` -header source: `Module.h `_ +header source: `Module.h `_ -doxygen info: `Module Class `_ +doxygen info: `Module Class `_ The ``Module`` class represents the top level structure present in LLVM programs. An LLVM module is effectively either a translation unit of the @@ -3611,9 +3611,9 @@ ``#include "llvm/IR/Value.h"`` -header source: `Value.h `_ +header source: `Value.h `_ -doxygen info: `Value Class `_ +doxygen info: `Value Class `_ The ``Value`` class is the most important class in the LLVM Source base. It represents a typed value that may be used (among other things) as an operand to @@ -3702,9 +3702,9 @@ ``#include "llvm/IR/User.h"`` -header source: `User.h `_ +header source: `User.h `_ -doxygen info: `User Class `_ +doxygen info: `User Class `_ Superclass: Value_ @@ -3749,10 +3749,10 @@ ``#include "llvm/IR/Instruction.h"`` header source: `Instruction.h -`_ +`_ doxygen info: `Instruction Class -`_ +`_ Superclasses: User_, Value_ @@ -3773,7 +3773,7 @@ concrete sub-classes of ``Instruction`` that implement the instruction (for example BinaryOperator_ and CmpInst_). Unfortunately, the use of macros in this file confuses doxygen, so these enum values don't show up correctly in the -`doxygen output `_. +`doxygen output `_. .. 
_s_Instruction: @@ -3890,10 +3890,10 @@ ``#include "llvm/IR/GlobalValue.h"`` header source: `GlobalValue.h -`_ +`_ doxygen info: `GlobalValue Class -`_ +`_ Superclasses: Constant_, User_, Value_ @@ -3948,10 +3948,10 @@ ``#include "llvm/IR/Function.h"`` -header source: `Function.h `_ +header source: `Function.h `_ doxygen info: `Function Class -`_ +`_ Superclasses: GlobalValue_, Constant_, User_, Value_ @@ -4057,10 +4057,10 @@ ``#include "llvm/IR/GlobalVariable.h"`` header source: `GlobalVariable.h -`_ +`_ doxygen info: `GlobalVariable Class -`_ +`_ Superclasses: GlobalValue_, Constant_, User_, Value_ @@ -4115,10 +4115,10 @@ ``#include "llvm/IR/BasicBlock.h"`` header source: `BasicBlock.h -`_ +`_ doxygen info: `BasicBlock Class -`_ +`_ Superclass: Value_ diff --git a/llvm/docs/Proposals/GitHubMove.rst b/llvm/docs/Proposals/GitHubMove.rst --- a/llvm/docs/Proposals/GitHubMove.rst +++ b/llvm/docs/Proposals/GitHubMove.rst @@ -319,7 +319,7 @@ # direct SVN checkout svn co https://user@llvm.org/svn/llvm-project/llvm/trunk llvm # or using the read-only Git view, with git-svn - git clone http://llvm.org/git/llvm.git + git clone https://llvm.org/git/llvm.git cd llvm git svn init https://llvm.org/svn/llvm-project/llvm/trunk --username= git config svn-remote.svn.fetch :refs/remotes/origin/master @@ -381,29 +381,29 @@ :: - svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm -r $REVISION + svn co https://llvm.org/svn/llvm-project/llvm/trunk llvm -r $REVISION cd llvm/tools - svn co http://llvm.org/svn/llvm-project/clang/trunk clang -r $REVISION + svn co https://llvm.org/svn/llvm-project/clang/trunk clang -r $REVISION cd ../projects - svn co http://llvm.org/svn/llvm-project/libcxx/trunk libcxx -r $REVISION + svn co https://llvm.org/svn/llvm-project/libcxx/trunk libcxx -r $REVISION Or using git-svn:: - git clone http://llvm.org/git/llvm.git + git clone https://llvm.org/git/llvm.git cd llvm/ git svn init https://llvm.org/svn/llvm-project/llvm/trunk --username= git config svn-remote.svn.fetch :refs/remotes/origin/master git svn rebase -l git checkout `git svn find-rev -B r258109` cd tools - git clone http://llvm.org/git/clang.git + git clone https://llvm.org/git/clang.git cd clang/ git svn init https://llvm.org/svn/llvm-project/clang/trunk --username= git config svn-remote.svn.fetch :refs/remotes/origin/master git svn rebase -l git checkout `git svn find-rev -B r258109` cd ../../projects/ - git clone http://llvm.org/git/libcxx.git + git clone https://llvm.org/git/libcxx.git cd libcxx git svn init https://llvm.org/svn/llvm-project/libcxx/trunk --username= git config svn-remote.svn.fetch :refs/remotes/origin/master diff --git a/llvm/docs/README.txt b/llvm/docs/README.txt --- a/llvm/docs/README.txt +++ b/llvm/docs/README.txt @@ -5,7 +5,7 @@ plaintext markup language (file extension `.rst`). While the reStructuredText documentation should be quite readable in source form, it is mostly meant to be processed by the Sphinx documentation generation -system to create HTML pages which are hosted on and +system to create HTML pages which are hosted on and updated after every commit. Manpage output is also supported, see below. If you instead would like to generate and view the HTML locally, install @@ -17,7 +17,7 @@ $BROWSER /docs//html/index.html The mapping between reStructuredText files and generated documentation is -`docs/Foo.rst` <-> `/docs//html/Foo.html` <-> `http://llvm.org/docs/Foo.html`. +`docs/Foo.rst` <-> `/docs//html/Foo.html` <-> `https://llvm.org/docs/Foo.html`. 
If you are interested in writing new documentation, you will want to read `SphinxQuickstartTemplate.rst` which will get you writing documentation @@ -41,7 +41,7 @@ `docs/CommandGuide/Foo.rst` <-> `/docs//man/Foo.1`. These .rst files are also included during HTML generation so they are also viewable online (as noted above) at e.g. -`http://llvm.org/docs/CommandGuide/Foo.html`. +`https://llvm.org/docs/CommandGuide/Foo.html`. Checking links ============== diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst --- a/llvm/docs/Reference.rst +++ b/llvm/docs/Reference.rst @@ -53,8 +53,8 @@ API Reference ------------- -`Doxygen generated documentation `_ - (`classes `_) +`Doxygen generated documentation `_ + (`classes `_) :doc:`HowToUseAttributes` Answers some questions about the new Attributes infrastructure. diff --git a/llvm/docs/ReleaseProcess.rst b/llvm/docs/ReleaseProcess.rst --- a/llvm/docs/ReleaseProcess.rst +++ b/llvm/docs/ReleaseProcess.rst @@ -115,7 +115,7 @@ :local: Follow the `LNT Quick Start Guide -`__ link on how to set-up the +`__ link on how to set-up the test-suite The binary location you'll have to use for testing is inside the @@ -160,7 +160,7 @@ You should: * Download the previous release sources from - http://llvm.org/releases/download.html. + https://llvm.org/releases/download.html. * Run the test-release.sh script on ``final`` mode (change ``-rc 1`` to ``-final``). @@ -190,7 +190,7 @@ You should: * Download the current candidate sources from where the release manager points - you (ex. http://llvm.org/pre-releases/3.3/rc1/). + you (ex. https://llvm.org/pre-releases/3.3/rc1/). * Repeat the steps above with ``-rc 1``, ``-rc 2`` etc modes and run the test-suite the same way. diff --git a/llvm/docs/SphinxQuickstartTemplate.rst b/llvm/docs/SphinxQuickstartTemplate.rst --- a/llvm/docs/SphinxQuickstartTemplate.rst +++ b/llvm/docs/SphinxQuickstartTemplate.rst @@ -84,7 +84,7 @@ Links ===== -You can format a link `like this `_. A more `sophisticated syntax`_ allows you to place the ``.. _`link text`: `` block +You can format a link `like this `_. A more `sophisticated syntax`_ allows you to place the ``.. _`link text`: `` block pretty much anywhere else in the document. This is useful when linking to especially long URLs. .. _`sophisticated syntax`: http://en.wikipedia.org/wiki/LLVM diff --git a/llvm/docs/TableGen/index.rst b/llvm/docs/TableGen/index.rst --- a/llvm/docs/TableGen/index.rst +++ b/llvm/docs/TableGen/index.rst @@ -28,7 +28,7 @@ The current major users of TableGen are :doc:`../CodeGenerator` and the -`Clang diagnostics and attributes `_. +`Clang diagnostics and attributes `_. Note that if you work on TableGen much, and use emacs or vim, that you can find an emacs "TableGen mode" and a vim language file in the ``llvm/utils/emacs`` and diff --git a/llvm/docs/TestSuiteGuide.md b/llvm/docs/TestSuiteGuide.md --- a/llvm/docs/TestSuiteGuide.md +++ b/llvm/docs/TestSuiteGuide.md @@ -19,7 +19,7 @@ % mkdir venv % virtualenv venv % . venv/bin/activate - % pip install svn+http://llvm.org/svn/llvm-project/llvm/trunk/utils/lit + % pip install svn+https://llvm.org/svn/llvm-project/llvm/trunk/utils/lit % lit --version lit 0.8.0dev ``` @@ -279,7 +279,7 @@ LNT is a set of client and server tools for continuously monitoring performance. You can find more information at -[http://llvm.org/docs/lnt](http://llvm.org/docs/lnt). The official LNT instance +[https://llvm.org/docs/lnt](https://llvm.org/docs/lnt). 
The official LNT instance of the LLVM project is hosted at [http://lnt.llvm.org](http://lnt.llvm.org). @@ -348,7 +348,7 @@ CMake allows to cross compile to a different target via toolchain files. More information can be found here: -- [http://llvm.org/docs/lnt/tests.html#cross-compiling](http://llvm.org/docs/lnt/tests.html#cross-compiling) +- [https://llvm.org/docs/lnt/tests.html#cross-compiling](https://llvm.org/docs/lnt/tests.html#cross-compiling) - [https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html](https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html) @@ -389,7 +389,7 @@ The LNT tool can run the test-suite. Use this when submitting test results to an LNT instance. See -[http://llvm.org/docs/lnt/tests.html#llvm-cmake-test-suite](http://llvm.org/docs/lnt/tests.html#llvm-cmake-test-suite) +[https://llvm.org/docs/lnt/tests.html#llvm-cmake-test-suite](https://llvm.org/docs/lnt/tests.html#llvm-cmake-test-suite) for details. Running the test-suite via Makefiles (deprecated) diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst --- a/llvm/docs/TestingGuide.rst +++ b/llvm/docs/TestingGuide.rst @@ -129,7 +129,7 @@ % cmake -DCMAKE_BUILD_TYPE="Release" -DLLVM_ENABLE_ASSERTIONS=On -If you have `Clang `_ checked out and built, you +If you have `Clang `_ checked out and built, you can run the LLVM and Clang tests simultaneously using: .. code-block:: bash diff --git a/llvm/docs/TypeMetadata.rst b/llvm/docs/TypeMetadata.rst --- a/llvm/docs/TypeMetadata.rst +++ b/llvm/docs/TypeMetadata.rst @@ -29,7 +29,7 @@ An intrinsic, :ref:`llvm.type.test `, is used to test whether a given pointer is associated with a type identifier. -.. _control flow integrity: http://clang.llvm.org/docs/ControlFlowIntegrity.html +.. _control flow integrity: https://clang.llvm.org/docs/ControlFlowIntegrity.html Representing Type Information using Type Metadata ================================================= @@ -160,7 +160,7 @@ The `GlobalLayoutBuilder`_ class is responsible for laying out the globals efficiently to minimize the sizes of the underlying bitsets. -.. _control flow integrity design document: http://clang.llvm.org/docs/ControlFlowIntegrityDesign.html +.. _control flow integrity design document: https://clang.llvm.org/docs/ControlFlowIntegrityDesign.html :Example: diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -2,7 +2,7 @@ =========== NOTE: If you are a user who is only interested in using an LLVM-based compiler, -you should look into `Clang `_ instead. The +you should look into `Clang `_ instead. The documentation here is intended for users who have a need to work with the intermediate LLVM representation. @@ -71,7 +71,7 @@ `How to build the C, C++, ObjC, and ObjC++ front end`__ Instructions for building the clang front-end from source. - .. __: http://clang.llvm.org/get_started.html + .. __: https://clang.llvm.org/get_started.html :doc:`CoverageMappingFormat` This describes the format and encoding used for LLVM’s code coverage mapping. diff --git a/llvm/docs/Vectorizers.rst b/llvm/docs/Vectorizers.rst --- a/llvm/docs/Vectorizers.rst +++ b/llvm/docs/Vectorizers.rst @@ -80,7 +80,7 @@ See the Clang `language extensions -`_ +`_ for details. Diagnostics @@ -133,7 +133,7 @@ To ensure line and column numbers are produced include the command line options ``-gline-tables-only`` and ``-gcolumn-info``. 
See the Clang `user manual -`_ +`_ for details Features diff --git a/llvm/docs/WritingAnLLVMPass.rst b/llvm/docs/WritingAnLLVMPass.rst --- a/llvm/docs/WritingAnLLVMPass.rst +++ b/llvm/docs/WritingAnLLVMPass.rst @@ -17,7 +17,7 @@ are, above all, a structuring technique for compiler code. All LLVM passes are subclasses of the `Pass -`_ class, which implement +`_ class, which implement functionality by overriding virtual methods inherited from ``Pass``. Depending on how your pass works, you should inherit from the :ref:`ModulePass ` , :ref:`CallGraphSCCPass @@ -98,8 +98,8 @@ #include "llvm/Support/raw_ostream.h" Which are needed because we are writing a `Pass -`_, we are operating on -`Function `_\ s, and we will +`_, we are operating on +`Function `_\ s, and we will be doing some printing. Next we have: @@ -336,7 +336,7 @@ --------------------------- The most plain and boring type of pass is the "`ImmutablePass -`_" class. This pass +`_" class. This pass type is used for passes that do not have to be run, do not change state, and never need to be updated. This is not a normal type of transformation or analysis, but can provide information about the current compiler configuration. @@ -353,7 +353,7 @@ The ``ModulePass`` class ------------------------ -The `ModulePass `_ class +The `ModulePass `_ class is the most general of all superclasses that you can use. Deriving from ``ModulePass`` indicates that your pass uses the entire program as a unit, referring to function bodies in no predictable order, or adding and removing @@ -388,7 +388,7 @@ ------------------------------ The `CallGraphSCCPass -`_ is used by +`_ is used by passes that need to traverse the program bottom-up on the call graph (callees before callers). Deriving from ``CallGraphSCCPass`` provides some mechanics for building and traversing the ``CallGraph``, but also allows the system to @@ -460,7 +460,7 @@ -------------------------- In contrast to ``ModulePass`` subclasses, `FunctionPass -`_ subclasses do have a +`_ subclasses do have a predictable, local behavior that can be expected by the system. All ``FunctionPass`` execute on each function in the program independent of all of the other functions in the program. ``FunctionPass``\ es do not require that @@ -498,7 +498,7 @@ overlap with any other pass executions (thus it should be very fast). A good example of how this method should be used is the `LowerAllocations -`_ pass. This pass +`_ pass. This pass converts ``malloc`` and ``free`` instructions into platform dependent ``malloc()`` and ``free()`` function calls. It uses the ``doInitialization`` method to get a reference to the ``malloc`` and ``free`` functions that it @@ -761,7 +761,7 @@ By implementing the ``getAnalysisUsage`` method, the required and invalidated sets may be specified for your transformation. The implementation should fill in the `AnalysisUsage -`_ object with +`_ object with information about which passes are required and not invalidated. To do this, a pass may call any of the following methods on the ``AnalysisUsage`` object: @@ -914,14 +914,14 @@ `. As a concrete example of an Analysis Group in action, consider the -`AliasAnalysis `_ +`AliasAnalysis `_ analysis group. The default implementation of the alias analysis interface -(the `basicaa `_ pass) +(the `basicaa `_ pass) just does a few simple checks that don't require significant analysis to compute (such as: two different globals can never alias each other, etc). 
Passes that use the `AliasAnalysis -`_ interface (for -example the `gvn `_ pass), do not +`_ interface (for +example the `gvn `_ pass), do not care which implementation of alias analysis is actually provided, they just use the designated interface. @@ -963,7 +963,7 @@ This just shows a class ``FancyAA`` that uses the ``INITIALIZE_AG_PASS`` macro both to register and to "join" the `AliasAnalysis -`_ analysis group. +`_ analysis group. Every implementation of an analysis group should join using this macro. .. code-block:: c++ @@ -982,13 +982,13 @@ default implementation available at all times for an Analysis Group to be used. Only default implementation can derive from ``ImmutablePass``. Here we declare that the `BasicAliasAnalysis -`_ pass is the default +`_ pass is the default implementation for the interface. Pass Statistics =============== -The `Statistic `_ class is +The `Statistic `_ class is designed to be an easy way to expose various success metrics from passes. These statistics are printed at the end of a run, when the :option:`-stats` command line option is enabled on the command line. See the :ref:`Statistics @@ -999,8 +999,8 @@ What PassManager does --------------------- -The `PassManager `_ `class -`_ takes a list of +The `PassManager `_ `class +`_ takes a list of passes, ensures their :ref:`prerequisites ` are set up correctly, and then schedules passes to run efficiently. All of the LLVM tools that run passes use the PassManager for execution of these passes. @@ -1030,7 +1030,7 @@ touching the LLVM program representation for a single function at a time, instead of traversing the entire program. It reduces the memory consumption of compiler, because, for example, only one `DominatorSet - `_ needs to be + `_ needs to be calculated at a time. This also makes it possible to implement some :ref:`interesting enhancements ` in the future. diff --git a/llvm/docs/index.rst b/llvm/docs/index.rst --- a/llvm/docs/index.rst +++ b/llvm/docs/index.rst @@ -4,7 +4,7 @@ .. warning:: If you are using a released version of LLVM, see `the download page - `_ to find your documentation. + `_ to find your documentation. The LLVM compiler infrastructure supports a wide range of projects, from industrial strength compilers to specialized JIT applications to small @@ -27,7 +27,7 @@ `Introduction to the LLVM Compiler`__ Presentation providing a users introduction to LLVM. - .. __: http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html + .. __: https://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html `Intro to LLVM`__ A chapter from the book "The Architecture of Open Source Applications" that @@ -39,12 +39,12 @@ `LLVM: A Compilation Framework for Lifelong Program Analysis & Transformation`__ Design overview. - .. __: http://llvm.org/pubs/2004-01-30-CGO-LLVM.html + .. __: https://llvm.org/pubs/2004-01-30-CGO-LLVM.html `LLVM: An Infrastructure for Multi-Stage Optimization`__ More details (quite old now). - .. __: http://llvm.org/pubs/2002-12-LattnerMSThesis.html + .. __: https://llvm.org/pubs/2002-12-LattnerMSThesis.html Documentation ============= diff --git a/llvm/docs/tutorial/BuildingAJIT1.rst b/llvm/docs/tutorial/BuildingAJIT1.rst --- a/llvm/docs/tutorial/BuildingAJIT1.rst +++ b/llvm/docs/tutorial/BuildingAJIT1.rst @@ -320,4 +320,4 @@ +-----------------------------+-----------------------------------------------+ .. 
[3] See the ErrorHandling section in the LLVM Programmer's Manual - (http://llvm.org/docs/ProgrammersManual.html#error-handling) + (https://llvm.org/docs/ProgrammersManual.html#error-handling) diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst @@ -718,7 +718,7 @@ Here is the complete code listing for our running example. Because this uses the LLVM libraries, we need to link them in. To do this, we use the -`llvm-config `_ tool to inform +`llvm-config `_ tool to inform our makefile/command line about which options to use: .. code-block:: bash diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst @@ -20,7 +20,7 @@ need to use a version of this tutorial that matches your LLVM release: If you are using an official LLVM release, use the version of the documentation included with your release or on the `llvm.org releases -page `_. +page `_. Code Generation Setup ===================== @@ -90,7 +90,7 @@ The ``Builder`` object is a helper object that makes it easy to generate LLVM instructions. Instances of the -`IRBuilder `_ +`IRBuilder `_ class template keep track of the current place to insert instructions and has methods to create new instructions. @@ -549,7 +549,7 @@ Here is the complete code listing for our running example, enhanced with the LLVM code generator. Because this uses the LLVM libraries, we need to link them in. To do this, we use the -`llvm-config `_ tool to inform +`llvm-config `_ tool to inform our makefile/command line about which options to use: .. code-block:: bash diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst @@ -98,7 +98,7 @@ Due to the transition to the new PassManager infrastructure this tutorial is based on ``llvm::legacy::FunctionPassManager`` which can be found in - `LegacyPassManager.h `_. + `LegacyPassManager.h `_. For the purpose of the this tutorial the above should be used until the pass manager transition is complete. diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.rst --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.rst @@ -213,7 +213,7 @@ } To visualize the control flow graph, you can use a nifty feature of the -LLVM '`opt `_' tool. If you put this LLVM +LLVM '`opt `_' tool. If you put this LLVM IR into "t.ll" and run "``llvm-as < t.ll | opt -analyze -view-cfg``", `a window will pop up <../../ProgrammersManual.html#viewing-graphs-while-debugging-code>`_ and you'll see this graph: diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl08.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl08.rst --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl08.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl08.rst @@ -23,7 +23,7 @@ To specify the architecture that you want to target, we use a string called a "target triple". 
This takes the form ``---`` (see the `cross compilation docs -`_). +`_). As an example, we can see what clang thinks is our current target triple: diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst @@ -165,13 +165,13 @@ ==================== Similar to the ``IRBuilder`` class we have a -`DIBuilder `_ class +`DIBuilder `_ class that helps in constructing debug metadata for an LLVM IR file. It corresponds 1:1 similarly to ``IRBuilder`` and LLVM IR, but with nicer names. Using it does require that you be more familiar with DWARF terminology than you needed to be with ``IRBuilder`` and ``Instruction`` names, but if you read through the general documentation on the -`Metadata Format `_ it +`Metadata Format `_ it should be a little more clear. We'll be using this class to construct all of our IR level descriptions. Construction for it takes a module so we need to construct it shortly after we construct our module. We've left it diff --git a/llvm/docs/tutorial/OCamlLangImpl3.rst b/llvm/docs/tutorial/OCamlLangImpl3.rst --- a/llvm/docs/tutorial/OCamlLangImpl3.rst +++ b/llvm/docs/tutorial/OCamlLangImpl3.rst @@ -65,7 +65,7 @@ The ``Codegen.builder`` object is a helper object that makes it easy to generate LLVM instructions. Instances of the -`IRBuilder `_ +`IRBuilder `_ class keep track of the current place to insert instructions and has methods to create new instructions. @@ -522,7 +522,7 @@ Here is the complete code listing for our running example, enhanced with the LLVM code generator. Because this uses the LLVM libraries, we need to link them in. To do this, we use the -`llvm-config `_ tool to inform +`llvm-config `_ tool to inform our makefile/command line about which options to use: .. code-block:: bash diff --git a/llvm/docs/tutorial/OCamlLangImpl5.rst b/llvm/docs/tutorial/OCamlLangImpl5.rst --- a/llvm/docs/tutorial/OCamlLangImpl5.rst +++ b/llvm/docs/tutorial/OCamlLangImpl5.rst @@ -161,7 +161,7 @@ } To visualize the control flow graph, you can use a nifty feature of the -LLVM '`opt `_' tool. If you put this LLVM +LLVM '`opt `_' tool. If you put this LLVM IR into "t.ll" and run "``llvm-as < t.ll | opt -analyze -view-cfg``", `a window will pop up <../ProgrammersManual.html#viewing-graphs-while-debugging-code>`_ and you'll see this graph: diff --git a/llvm/docs/tutorial/index.rst b/llvm/docs/tutorial/index.rst --- a/llvm/docs/tutorial/index.rst +++ b/llvm/docs/tutorial/index.rst @@ -51,5 +51,5 @@ Advanced Topics =============== -#. `Writing an Optimization for LLVM `_ +#. `Writing an Optimization for LLVM `_ diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -691,6 +691,10 @@ return getArch() == Triple::nvptx || getArch() == Triple::nvptx64; } + bool isAMDGPU() const { + return getArch() == Triple::r600 || getArch() == Triple::amdgcn; + } + /// Tests whether the target is Thumb (little and big endian). 
bool isThumb() const { return getArch() == Triple::thumb || getArch() == Triple::thumbeb; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -24,8 +24,6 @@ #include #include -#define DEBUG_TYPE "orc" - namespace llvm { namespace orc { @@ -309,66 +307,6 @@ /// A map of Symbols to (Symbol, Flags) pairs. using SymbolAliasMap = DenseMap; -/// Render a SymbolStringPtr. -raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym); - -/// Render a SymbolNameSet. -raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols); - -/// Render a SymbolNameVector. -raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols); - -/// Render a SymbolFlagsMap entry. -raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV); - -/// Render a SymbolMap entry. -raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV); - -/// Render a SymbolFlagsMap. -raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags); - -/// Render a SymbolMap. -raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols); - -/// Render a SymbolDependenceMap entry. -raw_ostream &operator<<(raw_ostream &OS, - const SymbolDependenceMap::value_type &KV); - -/// Render a SymbolDependendeMap. -raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps); - -/// Render a MaterializationUnit. -raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU); - -//// Render a JITDylibLookupFlags instance. -raw_ostream &operator<<(raw_ostream &OS, - const JITDylibLookupFlags &JDLookupFlags); - -/// Rendar a SymbolLookupFlags instance. -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags); - -/// Render a JITDylibLookupFlags instance. -raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K); - -/// Render a SymbolLookupSet entry. -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet::value_type &KV); - -/// Render a SymbolLookupSet. -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet); - -/// Render a JITDylibSearchOrder. -raw_ostream &operator<<(raw_ostream &OS, - const JITDylibSearchOrder &SearchOrder); - -/// Render a SymbolAliasMap. -raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases); - -/// Render a SymbolState. -raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S); - -/// Render a LookupKind. -raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K); - /// Callback to notify client that symbols have been resolved. using SymbolsResolvedCallback = unique_function)>; @@ -1301,11 +1239,8 @@ /// Materialize the given unit. 
void dispatchMaterialization(JITDylib &JD, std::unique_ptr MU) { - LLVM_DEBUG({ - runSessionLocked([&]() { - dbgs() << "Dispatching " << *MU << " for " << JD.getName() << "\n"; - }); - }); + assert(MU && "MU must be non-null"); + DEBUG_WITH_TYPE("orc", dumpDispatchInfo(JD, *MU)); DispatchMaterialization(JD, std::move(MU)); } @@ -1325,6 +1260,10 @@ void runOutstandingMUs(); +#ifndef NDEBUG + void dumpDispatchInfo(JITDylib &JD, MaterializationUnit &MU); +#endif // NDEBUG + mutable std::recursive_mutex SessionMutex; std::shared_ptr SSP; std::unique_ptr P; @@ -1425,6 +1364,4 @@ } // End namespace orc } // End namespace llvm -#undef DEBUG_TYPE // "orc" - #endif // LLVM_EXECUTIONENGINE_ORC_CORE_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h --- a/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h @@ -13,7 +13,11 @@ #ifndef LLVM_EXECUTIONENGINE_ORC_DEBUGUTILS_H #define LLVM_EXECUTIONENGINE_ORC_DEBUGUTILS_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" #include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" #include #include @@ -23,6 +27,71 @@ namespace orc { +// --raw_ostream operators for ORC types-- + +/// Render a SymbolStringPtr. +raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym); + +/// Render a SymbolNameSet. +raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols); + +/// Render a SymbolNameVector. +raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols); + +/// Render JITSymbolFlags. +raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags); + +/// Render a SymbolFlagsMap entry. +raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV); + +/// Render a SymbolMap entry. +raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV); + +/// Render a SymbolFlagsMap. +raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags); + +/// Render a SymbolMap. +raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols); + +/// Render a SymbolDependenceMap entry. +raw_ostream &operator<<(raw_ostream &OS, + const SymbolDependenceMap::value_type &KV); + +/// Render a SymbolDependendeMap. +raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps); + +/// Render a MaterializationUnit. +raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU); + +//// Render a JITDylibLookupFlags instance. +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibLookupFlags &JDLookupFlags); + +/// Rendar a SymbolLookupFlags instance. +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags); + +/// Render a JITDylibLookupFlags instance. +raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K); + +/// Render a SymbolLookupSet entry. +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet::value_type &KV); + +/// Render a SymbolLookupSet. +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet); + +/// Render a JITDylibSearchOrder. +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibSearchOrder &SearchOrder); + +/// Render a SymbolAliasMap. +raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases); + +/// Render a SymbolState. +raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S); + +/// Render a LookupKind. 
+raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K); + /// A function object that can be used as an ObjectTransformLayer transform /// to dump object files to disk at a specified path. class DumpObjects { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h --- a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/IR/PassManager.h" #include "llvm/Passes/PassBuilder.h" diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h --- a/llvm/include/llvm/MC/MCObjectFileInfo.h +++ b/llvm/include/llvm/MC/MCObjectFileInfo.h @@ -221,6 +221,7 @@ public: void InitMCObjectFileInfo(const Triple &TT, bool PIC, MCContext &ctx, bool LargeCodeModel = false); + MCContext &getContext() const { return *Ctx; } bool getSupportsWeakOmittedEHFrame() const { return SupportsWeakOmittedEHFrame; diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h --- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h +++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h @@ -37,8 +37,6 @@ class TargetMachine; class TargetLoweringObjectFile : public MCObjectFileInfo { - MCContext *Ctx = nullptr; - /// Name-mangler for global names. Mangler *Mang = nullptr; @@ -67,7 +65,6 @@ operator=(const TargetLoweringObjectFile &) = delete; virtual ~TargetLoweringObjectFile(); - MCContext &getContext() const { return *Ctx; } Mangler &getMangler() const { return *Mang; } /// This method must be called before any actual lowering is done. This diff --git a/llvm/include/llvm/TextAPI/MachO/InterfaceFile.h b/llvm/include/llvm/TextAPI/MachO/InterfaceFile.h --- a/llvm/include/llvm/TextAPI/MachO/InterfaceFile.h +++ b/llvm/include/llvm/TextAPI/MachO/InterfaceFile.h @@ -278,11 +278,6 @@ return ParentUmbrellas; } - /// Get the parent umbrella framework. - const std::vector> getParentUmbrellas() const { - return ParentUmbrellas; - } - /// Add an allowable client. 
/// /// Mach-O Dynamic libraries have the concept of allowable clients that are diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h --- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -217,7 +217,6 @@ void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM); void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS); void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM); - void addInstructionCombiningPass(legacy::PassManagerBase &MPM) const; public: /// populateFunctionPassManager - This fills in the function pass manager, diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h --- a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h @@ -24,14 +24,13 @@ class InstCombinePass : public PassInfoMixin { InstCombineWorklist Worklist; - const bool ExpensiveCombines; const unsigned MaxIterations; public: static StringRef name() { return "InstCombinePass"; } - explicit InstCombinePass(bool ExpensiveCombines = true); - explicit InstCombinePass(bool ExpensiveCombines, unsigned MaxIterations); + explicit InstCombinePass(); + explicit InstCombinePass(unsigned MaxIterations); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; @@ -42,15 +41,13 @@ /// will try to combine all instructions in the function. class InstructionCombiningPass : public FunctionPass { InstCombineWorklist Worklist; - const bool ExpensiveCombines; const unsigned MaxIterations; public: static char ID; // Pass identification, replacement for typeid - explicit InstructionCombiningPass(bool ExpensiveCombines = true); - explicit InstructionCombiningPass(bool ExpensiveCombines, - unsigned MaxIterations); + explicit InstructionCombiningPass(); + explicit InstructionCombiningPass(unsigned MaxIterations); void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnFunction(Function &F) override; @@ -68,9 +65,8 @@ // into: // %Z = add int 2, %X // -FunctionPass *createInstructionCombiningPass(bool ExpensiveCombines = true); -FunctionPass *createInstructionCombiningPass(bool ExpensiveCombines, - unsigned MaxIterations); +FunctionPass *createInstructionCombiningPass(); +FunctionPass *createInstructionCombiningPass(unsigned MaxIterations); } #endif diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h --- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h +++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h @@ -23,12 +23,13 @@ class MemSetInst; class TargetTransformInfo; class Value; +struct Align; /// Emit a loop implementing the semantics of llvm.memcpy where the size is not /// a compile-time constant. Loop will be insterted at \p InsertBefore. void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen, - unsigned SrcAlign, unsigned DestAlign, + Align SrcAlign, Align DestAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI); @@ -36,11 +37,10 @@ /// compile time constant. Loop is inserted at \p InsertBefore. 
void createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, - unsigned SrcAlign, unsigned DestAlign, + Align SrcAlign, Align DestAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI); - /// Expand \p MemCpy as a loop. \p MemCpy is not deleted. void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI); diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1278,11 +1278,11 @@ static ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest, - DenseMap &Visited); + SmallDenseMap &Visited); static ValueLatticeElement getValueFromConditionImpl(Value *Val, Value *Cond, bool isTrueDest, - DenseMap &Visited) { + SmallDenseMap &Visited) { if (ICmpInst *ICI = dyn_cast(Cond)) return getValueFromICmpCondition(Val, ICI, isTrueDest); @@ -1315,7 +1315,7 @@ static ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest, - DenseMap &Visited) { + SmallDenseMap &Visited) { auto I = Visited.find(Cond); if (I != Visited.end()) return I->second; @@ -1328,7 +1328,7 @@ ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest) { assert(Cond && "precondition"); - DenseMap Visited; + SmallDenseMap Visited; return getValueFromCondition(Val, Cond, isTrueDest, Visited); } diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -105,14 +105,12 @@ TLI.setShouldExtI32Return(ShouldExtI32Return); TLI.setShouldSignExtI32Param(ShouldSignExtI32Param); - if (T.getArch() == Triple::r600 || - T.getArch() == Triple::amdgcn) + if (T.isAMDGPU()) TLI.disableAllFunctions(); // There are no library implementations of memcpy and memset for AMD gpus and // these can be difficult to lower in the backend. - if (T.getArch() == Triple::r600 || - T.getArch() == Triple::amdgcn) { + if (T.isAMDGPU()) { TLI.setUnavailable(LibFunc_memcpy); TLI.setUnavailable(LibFunc_memset); TLI.setUnavailable(LibFunc_memset_pattern16); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1737,7 +1737,12 @@ } break; case Instruction::ShuffleVector: { - auto *Shuf = cast(I); + auto *Shuf = dyn_cast(I); + // FIXME: Do we need to handle ConstantExpr involving shufflevectors? + if (!Shuf) { + Known.resetAll(); + return; + } // For undef elements, we don't know anything about the common state of // the shuffle result. APInt DemandedLHS, DemandedRHS; @@ -1763,10 +1768,9 @@ break; } case Instruction::InsertElement: { - auto *IEI = cast(I); - Value *Vec = IEI->getOperand(0); - Value *Elt = IEI->getOperand(1); - auto *CIdx = dyn_cast(IEI->getOperand(2)); + const Value *Vec = I->getOperand(0); + const Value *Elt = I->getOperand(1); + auto *CIdx = dyn_cast(I->getOperand(2)); // Early out if the index is non-constant or out-of-range. unsigned NumElts = DemandedElts.getBitWidth(); if (!CIdx || CIdx->getValue().uge(NumElts)) { @@ -1796,9 +1800,8 @@ case Instruction::ExtractElement: { // Look through extract element. If the index is non-constant or // out-of-range demand all elements, otherwise just the extracted element. 
- auto* EEI = cast(I); - const Value* Vec = EEI->getVectorOperand(); - const Value* Idx = EEI->getIndexOperand(); + const Value *Vec = I->getOperand(0); + const Value *Idx = I->getOperand(1); auto *CIdx = dyn_cast(Idx); unsigned NumElts = Vec->getType()->getVectorNumElements(); APInt DemandedVecElts = APInt::getAllOnesValue(NumElts); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2090,21 +2090,10 @@ // init priority. if (!isa(List)) return; - // Sanity check the structors list. - const ConstantArray *InitList = dyn_cast(List); - if (!InitList) return; // Not an array! - StructType *ETy = dyn_cast(InitList->getType()->getElementType()); - if (!ETy || ETy->getNumElements() != 3 || - !isa(ETy->getTypeAtIndex(0U)) || - !isa(ETy->getTypeAtIndex(1U)) || - !isa(ETy->getTypeAtIndex(2U))) - return; // Not (int, ptr, ptr). - // Gather the structors in a form that's convenient for sorting by priority. SmallVector Structors; - for (Value *O : InitList->operands()) { - ConstantStruct *CS = dyn_cast(O); - if (!CS) continue; // Malformed. + for (Value *O : cast(List)->operands()) { + auto *CS = cast(O); if (CS->getOperand(1)->isNullValue()) break; // Found a null terminator, skip the rest. ConstantInt *Priority = dyn_cast(CS->getOperand(0)); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -1390,11 +1390,12 @@ if (!DstTy.isScalar()) return UnableToLegalize; - if (WideTy.getSizeInBits() == SrcTy.getSizeInBits()) { + if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) { if (SrcTy.isPointer()) { const DataLayout &DL = MIRBuilder.getDataLayout(); if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) { - LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n"); + LLVM_DEBUG( + dbgs() << "Not casting non-integral address space integer\n"); return UnableToLegalize; } @@ -1402,6 +1403,14 @@ SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0); } + // Widen SrcTy to WideTy. This does not affect the result, but since the + // user requested this size, it is probably better handled than SrcTy and + // should reduce the total number of legalization artifacts + if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) { + SrcTy = WideTy; + SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0); + } + // Theres no unmerge type to target. Directly extract the bits from the // source type unsigned DstSize = DstTy.getSizeInBits(); @@ -1417,10 +1426,6 @@ return Legalized; } - // TODO - if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) - return UnableToLegalize; - // Extend the source to a wider type. 
LLT LCMTy = getLCMType(SrcTy, WideTy); diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -130,7 +130,7 @@ const Function &F) { if (F.hasFnAttribute(Attribute::StackAlignment)) return F.getFnStackAlignment(); - return STI->getFrameLowering()->getStackAlignment(); + return STI->getFrameLowering()->getStackAlign().value(); } MachineFunction::MachineFunction(const Function &F, diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -438,14 +438,12 @@ unsigned Size = RegInfo->getSpillSize(*RC); if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { // Nope, just spill it anywhere convenient. - unsigned Align = RegInfo->getSpillAlignment(*RC); - unsigned StackAlign = TFI->getStackAlignment(); - + Align Alignment(RegInfo->getSpillAlignment(*RC)); // We may not be able to satisfy the desired alignment specification of // the TargetRegisterClass if the stack alignment is smaller. Use the // min. - Align = std::min(Align, StackAlign); - FrameIdx = MFI.CreateStackObject(Size, Align, true); + Alignment = std::min(Alignment, TFI->getStackAlign()); + FrameIdx = MFI.CreateStackObject(Size, Alignment, true); if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } else { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11957,6 +11957,7 @@ // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); + bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros(); // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. @@ -12120,7 +12121,7 @@ // -> (fma (fneg y), z, (fma (fneg u), v, x)) if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N1.getOperand(2)) && - N1->hasOneUse()) { + N1->hasOneUse() && NoSignedZero) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -14228,118 +14229,142 @@ return true; } -/// Try to combine a load/store with a add/sub of the base pointer node into a -/// post-indexed load/store. The transformation folded the add/subtract into the -/// new indexed load/store effectively and all of its uses are redirected to the -/// new load/store. -bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { - if (Level < AfterLegalizeDAG) +static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse, + SDValue &BasePtr, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG, + const TargetLowering &TLI) { + if (PtrUse == N || + (PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB)) return false; - bool IsLoad = true; - bool IsMasked = false; - SDValue Ptr; - if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked, - Ptr, TLI)) + if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG)) return false; - if (Ptr.getNode()->hasOneUse()) + // Don't create a indexed load / store with zero offset. 
+ if (isNullConstant(Offset)) return false; - for (SDNode *Op : Ptr.getNode()->uses()) { - if (Op == N || - (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)) - continue; + if (isa(BasePtr) || isa(BasePtr)) + return false; - SDValue BasePtr; - SDValue Offset; - ISD::MemIndexedMode AM = ISD::UNINDEXED; - if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) { - // Don't create a indexed load / store with zero offset. - if (isNullConstant(Offset)) - continue; + SmallPtrSet Visited; + for (SDNode *Use : BasePtr.getNode()->uses()) { + if (Use == Ptr.getNode()) + continue; - // Try turning it into a post-indexed load / store except when - // 1) All uses are load / store ops that use it as base ptr (and - // it may be folded as addressing mmode). - // 2) Op must be independent of N, i.e. Op is neither a predecessor - // nor a successor of N. Otherwise, if Op is folded that would - // create a cycle. + // No if there's a later user which could perform the index instead. + if (isa(Use)) { + bool IsLoad = true; + bool IsMasked = false; + SDValue OtherPtr; + if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad, + IsMasked, OtherPtr, TLI)) { + SmallVector Worklist; + Worklist.push_back(Use); + if (SDNode::hasPredecessorHelper(N, Visited, Worklist)) + return false; + } + } - if (isa(BasePtr) || isa(BasePtr)) - continue; + // If all the uses are load / store addresses, then don't do the + // transformation. + if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { + for (SDNode *UseUse : Use->uses()) + if (canFoldInAddressingMode(Use, UseUse, DAG, TLI)) + return false; + } + } + return true; +} - // Check for #1. - bool TryNext = false; - for (SDNode *Use : BasePtr.getNode()->uses()) { - if (Use == Ptr.getNode()) - continue; +static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad, + bool &IsMasked, SDValue &Ptr, + SDValue &BasePtr, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG, + const TargetLowering &TLI) { + if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, + IsMasked, Ptr, TLI) || + Ptr.getNode()->hasOneUse()) + return nullptr; + + // Try turning it into a post-indexed load / store except when + // 1) All uses are load / store ops that use it as base ptr (and + // it may be folded as addressing mmode). + // 2) Op must be independent of N, i.e. Op is neither a predecessor + // nor a successor of N. Otherwise, if Op is folded that would + // create a cycle. + for (SDNode *Op : Ptr->uses()) { + // Check for #1. + if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI)) + continue; - // If all the uses are load / store addresses, then don't do the - // transformation. - if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { - bool RealUse = false; - for (SDNode *UseUse : Use->uses()) { - if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) - RealUse = true; - } + // Check for #2. + SmallPtrSet Visited; + SmallVector Worklist; + // Ptr is predecessor to both N and Op. + Visited.insert(Ptr.getNode()); + Worklist.push_back(N); + Worklist.push_back(Op); + if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) && + !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) + return Op; + } + return nullptr; +} - if (!RealUse) { - TryNext = true; - break; - } - } - } +/// Try to combine a load/store with a add/sub of the base pointer node into a +/// post-indexed load/store. 
The transformation folded the add/subtract into the +/// new indexed load/store effectively and all of its uses are redirected to the +/// new load/store. +bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { + if (Level < AfterLegalizeDAG) + return false; - if (TryNext) - continue; + bool IsLoad = true; + bool IsMasked = false; + SDValue Ptr; + SDValue BasePtr; + SDValue Offset; + ISD::MemIndexedMode AM = ISD::UNINDEXED; + SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr, + Offset, AM, DAG, TLI); + if (!Op) + return false; - // Check for #2. - SmallPtrSet Visited; - SmallVector Worklist; - // Ptr is predecessor to both N and Op. - Visited.insert(Ptr.getNode()); - Worklist.push_back(N); - Worklist.push_back(Op); - if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) && - !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) { - SDValue Result; - if (!IsMasked) - Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, - Offset, AM) - : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), + SDValue Result; + if (!IsMasked) + Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, + Offset, AM) + : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM); + else + Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM) + : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM); - else - Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), - BasePtr, Offset, AM) - : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), - BasePtr, Offset, AM); - ++PostIndexedNodes; - ++NodesCombined; - LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); - dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); - dbgs() << '\n'); - WorklistRemover DeadNodes(*this); - if (IsLoad) { - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); - } else { - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); - } - - // Finally, since the node is now dead, remove it from the graph. - deleteAndRecombine(N); - - // Replace the uses of Use with uses of the updated base value. - DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), - Result.getValue(IsLoad ? 1 : 0)); - deleteAndRecombine(Op); - return true; - } - } + ++PostIndexedNodes; + ++NodesCombined; + LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); + dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); + dbgs() << '\n'); + WorklistRemover DeadNodes(*this); + if (IsLoad) { + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); + } else { + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); } - return false; + // Finally, since the node is now dead, remove it from the graph. + deleteAndRecombine(N); + + // Replace the uses of Use with uses of the updated base value. + DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), + Result.getValue(IsLoad ? 1 : 0)); + deleteAndRecombine(Op); + return true; } /// Return the base-pointer arithmetic from an indexed \p LD. 
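For readers skimming the refactor above, here is a minimal source-level sketch (a hypothetical function, not part of the patch) of the pattern that post-indexed combining targets: a load or store whose base pointer is immediately advanced by a constant. When the increment is independent of the memory access and is not better folded as an addressing mode by some other user, the combiner rewrites the pair into a single post-indexed memory operation (for example ARM's "ldr r0, [r1], #4").

// Hypothetical illustration only: the load plus the constant base-pointer
// update below form the (load, ISD::ADD-of-base) pair that
// CombineToPostIndexedLoadStore tries to fold into one post-indexed load.
int sumWords(const int *P, int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I) {
    Sum += *P; // memory access using the current base pointer
    P += 1;    // constant base update; candidate for the post-increment form
  }
  return Sum;
}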
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -85,7 +85,6 @@ TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); - unsigned StackAlign = TFI->getStackAlignment(); DA = DAG->getDivergenceAnalysis(); // Check whether the function can return without sret-demotion. @@ -130,19 +129,19 @@ // Initialize the mapping of values to registers. This is only set up for // instruction values that are used outside of the block that defines // them. + const Align StackAlign = TFI->getStackAlign(); for (const BasicBlock &BB : *Fn) { for (const Instruction &I : BB) { if (const AllocaInst *AI = dyn_cast(&I)) { Type *Ty = AI->getAllocatedType(); - unsigned Align = - std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty), - AI->getAlignment()); + Align Alignment = + max(MF->getDataLayout().getPrefTypeAlign(Ty), AI->getAlign()); // Static allocas can be folded into the initial stack frame // adjustment. For targets that don't realign the stack, don't // do this if there is an extra alignment requirement. if (AI->isStaticAlloca() && - (TFI->isStackRealignable() || (Align <= StackAlign))) { + (TFI->isStackRealignable() || (Alignment <= StackAlign))) { const ConstantInt *CUI = cast(AI->getArraySize()); uint64_t TySize = MF->getDataLayout().getTypeAllocSize(Ty).getKnownMinSize(); @@ -154,10 +153,10 @@ if (Iter != CatchObjects.end() && TLI->needsFixedCatchObjects()) { FrameIndex = MF->getFrameInfo().CreateFixedObject( TySize, 0, /*IsImmutable=*/false, /*isAliased=*/true); - MF->getFrameInfo().setObjectAlignment(FrameIndex, Align); + MF->getFrameInfo().setObjectAlignment(FrameIndex, Alignment); } else { - FrameIndex = - MF->getFrameInfo().CreateStackObject(TySize, Align, false, AI); + FrameIndex = MF->getFrameInfo().CreateStackObject(TySize, Alignment, + false, AI); } // Scalable vectors may need a special StackID to distinguish @@ -176,10 +175,9 @@ // FIXME: Overaligned static allocas should be grouped into // a single dynamic allocation instead of using a separate // stack allocation for each one. - if (Align <= StackAlign) - Align = 0; // Inform the Frame Information that we have variable-sized objects. - MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, AI); + MF->getFrameInfo().CreateVariableSizedObject( + Alignment <= StackAlign ? 0 : Alignment.value(), AI); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4048,13 +4048,10 @@ return FirstAnswer; } - // Okay, we know that the sign bit in Mask is set. Use CLZ to determine + // Okay, we know that the sign bit in Mask is set. Use CLO to determine // the number of identical bits in the top of the input value. - Mask = ~Mask; Mask <<= Mask.getBitWidth()-VTBits; - // Return # leading zeros. We use 'min' here in case Val was zero before - // shifting. We don't want to return '64' as for an i32 "0". 
- return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros())); + return std::max(FirstAnswer, Mask.countLeadingOnes()); } bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4010,8 +4010,7 @@ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); auto &DL = DAG.getDataLayout(); uint64_t TySize = DL.getTypeAllocSize(Ty); - unsigned Align = - std::max((unsigned)DL.getPrefTypeAlignment(Ty), I.getAlignment()); + MaybeAlign Alignment = max(DL.getPrefTypeAlign(Ty), I.getAlign()); SDValue AllocSize = getValue(I.getArraySize()); @@ -4026,25 +4025,26 @@ // Handle alignment. If the requested alignment is less than or equal to // the stack alignment, ignore it. If the size is greater than or equal to // the stack alignment, we note this in the DYNAMIC_STACKALLOC node. - unsigned StackAlign = - DAG.getSubtarget().getFrameLowering()->getStackAlignment(); - if (Align <= StackAlign) - Align = 0; + Align StackAlign = DAG.getSubtarget().getFrameLowering()->getStackAlign(); + if (Alignment <= StackAlign) + Alignment = None; + const uint64_t StackAlignMask = StackAlign.value() - 1U; // Round the size of the allocation up to the stack alignment size // by add SA-1 to the size. This doesn't overflow because we're computing // an address inside an alloca. SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize, - DAG.getConstant(StackAlign - 1, dl, IntPtr), Flags); + DAG.getConstant(StackAlignMask, dl, IntPtr), Flags); // Mask out the low bits for alignment purposes. - AllocSize = - DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize, - DAG.getConstant(~(uint64_t)(StackAlign - 1), dl, IntPtr)); + AllocSize = DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize, + DAG.getConstant(~StackAlignMask, dl, IntPtr)); - SDValue Ops[] = {getRoot(), AllocSize, DAG.getConstant(Align, dl, IntPtr)}; + SDValue Ops[] = { + getRoot(), AllocSize, + DAG.getConstant(Alignment ? 
Alignment->value() : 0, dl, IntPtr)}; SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other); SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, VTs, Ops); setValue(&I, DSA); diff --git a/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp --- a/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp @@ -389,11 +389,9 @@ TypeStreamMerger::remapIndices(const CVType &OriginalType, MutableArrayRef Storage) { unsigned Align = OriginalType.RecordData.size() & 3; - unsigned AlignedSize = alignTo(OriginalType.RecordData.size(), 4); - assert(Storage.size() == AlignedSize && + assert(Storage.size() == alignTo(OriginalType.RecordData.size(), 4) && "The storage buffer size is not a multiple of 4 bytes which will " "cause misalignment in the output TPI stream!"); - (void)AlignedSize; SmallVector Refs; discoverTypeIndices(OriginalType.RecordData, Refs); diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -10,10 +10,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Config/llvm-config.h" +#include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/OrcError.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" #include #if LLVM_ENABLE_THREADS @@ -22,122 +20,6 @@ #define DEBUG_TYPE "orc" -using namespace llvm; - -namespace { - -#ifndef NDEBUG - -cl::opt PrintHidden("debug-orc-print-hidden", cl::init(true), - cl::desc("debug print hidden symbols defined by " - "materialization units"), - cl::Hidden); - -cl::opt PrintCallable("debug-orc-print-callable", cl::init(true), - cl::desc("debug print callable symbols defined by " - "materialization units"), - cl::Hidden); - -cl::opt PrintData("debug-orc-print-data", cl::init(true), - cl::desc("debug print data symbols defined by " - "materialization units"), - cl::Hidden); - -#endif // NDEBUG - -// SetPrinter predicate that prints every element. -template struct PrintAll { - bool operator()(const T &E) { return true; } -}; - -bool anyPrintSymbolOptionSet() { -#ifndef NDEBUG - return PrintHidden || PrintCallable || PrintData; -#else - return false; -#endif // NDEBUG -} - -bool flagsMatchCLOpts(const JITSymbolFlags &Flags) { -#ifndef NDEBUG - // Bail out early if this is a hidden symbol and we're not printing hiddens. - if (!PrintHidden && !Flags.isExported()) - return false; - - // Return true if this is callable and we're printing callables. - if (PrintCallable && Flags.isCallable()) - return true; - - // Return true if this is data and we're printing data. - if (PrintData && !Flags.isCallable()) - return true; - - // otherwise return false. - return false; -#else - return false; -#endif // NDEBUG -} - -// Prints a sequence of items, filtered by an user-supplied predicate. 
-template > -class SequencePrinter { -public: - SequencePrinter(const Sequence &S, char OpenSeq, char CloseSeq, - Pred ShouldPrint = Pred()) - : S(S), OpenSeq(OpenSeq), CloseSeq(CloseSeq), - ShouldPrint(std::move(ShouldPrint)) {} - - void printTo(llvm::raw_ostream &OS) const { - bool PrintComma = false; - OS << OpenSeq; - for (auto &E : S) { - if (ShouldPrint(E)) { - if (PrintComma) - OS << ','; - OS << ' ' << E; - PrintComma = true; - } - } - OS << ' ' << CloseSeq; - } - -private: - const Sequence &S; - char OpenSeq; - char CloseSeq; - mutable Pred ShouldPrint; -}; - -template -SequencePrinter printSequence(const Sequence &S, char OpenSeq, - char CloseSeq, Pred P = Pred()) { - return SequencePrinter(S, OpenSeq, CloseSeq, std::move(P)); -} - -// Render a SequencePrinter by delegating to its printTo method. -template -llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const SequencePrinter &Printer) { - Printer.printTo(OS); - return OS; -} - -struct PrintSymbolFlagsMapElemsMatchingCLOpts { - bool operator()(const orc::SymbolFlagsMap::value_type &KV) { - return flagsMatchCLOpts(KV.second); - } -}; - -struct PrintSymbolMapElemsMatchingCLOpts { - bool operator()(const orc::SymbolMap::value_type &KV) { - return flagsMatchCLOpts(KV.second.getFlags()); - } -}; - -} // end anonymous namespace - namespace llvm { namespace orc { @@ -152,162 +34,6 @@ void MaterializationUnit::anchor() {} -raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym) { - return OS << *Sym; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) { - return OS << printSequence(Symbols, '{', '}', PrintAll()); -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols) { - return OS << printSequence(Symbols, '[', ']', PrintAll()); -} - -raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) { - if (Flags.hasError()) - OS << "[*ERROR*]"; - if (Flags.isCallable()) - OS << "[Callable]"; - else - OS << "[Data]"; - if (Flags.isWeak()) - OS << "[Weak]"; - else if (Flags.isCommon()) - OS << "[Common]"; - - if (!Flags.isExported()) - OS << "[Hidden]"; - - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const JITEvaluatedSymbol &Sym) { - return OS << format("0x%016" PRIx64, Sym.getAddress()) << " " - << Sym.getFlags(); -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV) { - return OS << "(\"" << KV.first << "\", " << KV.second << ")"; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV) { - return OS << "(\"" << KV.first << "\": " << KV.second << ")"; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags) { - return OS << printSequence(SymbolFlags, '{', '}', - PrintSymbolFlagsMapElemsMatchingCLOpts()); -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols) { - return OS << printSequence(Symbols, '{', '}', - PrintSymbolMapElemsMatchingCLOpts()); -} - -raw_ostream &operator<<(raw_ostream &OS, - const SymbolDependenceMap::value_type &KV) { - return OS << "(" << KV.first->getName() << ", " << KV.second << ")"; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps) { - return OS << printSequence(Deps, '{', '}', - PrintAll()); -} - -raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) { - OS << "MU@" << &MU << " (\"" << MU.getName() << "\""; - if (anyPrintSymbolOptionSet()) - OS << ", " << MU.getSymbols(); - return OS << ")"; -} - -raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K) { - switch (K) { - 
case LookupKind::Static: - return OS << "Static"; - case LookupKind::DLSym: - return OS << "DLSym"; - } - llvm_unreachable("Invalid lookup kind"); -} - -raw_ostream &operator<<(raw_ostream &OS, - const JITDylibLookupFlags &JDLookupFlags) { - switch (JDLookupFlags) { - case JITDylibLookupFlags::MatchExportedSymbolsOnly: - return OS << "MatchExportedSymbolsOnly"; - case JITDylibLookupFlags::MatchAllSymbols: - return OS << "MatchAllSymbols"; - } - llvm_unreachable("Invalid JITDylib lookup flags"); -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags) { - switch (LookupFlags) { - case SymbolLookupFlags::RequiredSymbol: - return OS << "RequiredSymbol"; - case SymbolLookupFlags::WeaklyReferencedSymbol: - return OS << "WeaklyReferencedSymbol"; - } - llvm_unreachable("Invalid symbol lookup flags"); -} - -raw_ostream &operator<<(raw_ostream &OS, - const SymbolLookupSet::value_type &KV) { - return OS << "(" << KV.first << ", " << KV.second << ")"; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet) { - return OS << printSequence(LookupSet, '{', '}', - PrintAll()); -} - -raw_ostream &operator<<(raw_ostream &OS, - const JITDylibSearchOrder &SearchOrder) { - OS << "["; - if (!SearchOrder.empty()) { - assert(SearchOrder.front().first && - "JITDylibList entries must not be null"); - OS << " (\"" << SearchOrder.front().first->getName() << "\", " - << SearchOrder.begin()->second << ")"; - for (auto &KV : - make_range(std::next(SearchOrder.begin(), 1), SearchOrder.end())) { - assert(KV.first && "JITDylibList entries must not be null"); - OS << ", (\"" << KV.first->getName() << "\", " << KV.second << ")"; - } - } - OS << " ]"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases) { - OS << "{"; - for (auto &KV : Aliases) - OS << " " << *KV.first << ": " << KV.second.Aliasee << " " - << KV.second.AliasFlags; - OS << " }"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S) { - switch (S) { - case SymbolState::Invalid: - return OS << "Invalid"; - case SymbolState::NeverSearched: - return OS << "Never-Searched"; - case SymbolState::Materializing: - return OS << "Materializing"; - case SymbolState::Resolved: - return OS << "Resolved"; - case SymbolState::Emitted: - return OS << "Emitted"; - case SymbolState::Ready: - return OS << "Ready"; - } - llvm_unreachable("Invalid state"); -} - FailedToMaterialize::FailedToMaterialize( std::shared_ptr Symbols) : Symbols(std::move(Symbols)) { @@ -2306,5 +2032,13 @@ } } +#ifndef NDEBUG +void ExecutionSession::dumpDispatchInfo(JITDylib &JD, MaterializationUnit &MU) { + runSessionLocked([&]() { + dbgs() << "Dispatching " << MU << " for " << JD.getName() << "\n"; + }); +} +#endif // NDEBUG + } // End namespace orc. } // End namespace llvm. 
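The hunks above move the Orc symbol-printing operator<< overloads and the -debug-orc-print-* command-line options out of Core.cpp, and the DebugUtils.cpp hunk below re-adds them there. The short sketch that follows is illustrative only (a hypothetical client, not part of the patch); it assumes the relocated operators are declared in DebugUtils.h, which is consistent with the new DebugUtils.h includes added to Core.cpp, Layer.cpp, and MachOPlatform.cpp in this change.

#include "llvm/ExecutionEngine/Orc/Core.h"
#include "llvm/ExecutionEngine/Orc/DebugUtils.h" // printing operators now live here
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "orc"

// Hypothetical helper, not part of the patch: prints a lookup request using
// the operator<< overloads that this change relocates into DebugUtils.
static void logLookup(const llvm::orc::SymbolNameSet &Names) {
  LLVM_DEBUG(llvm::dbgs() << "Looking up " << Names << "\n");
}

Clients that previously picked these operators up transitively through Core.cpp would, under this layout, include DebugUtils.h explicitly.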
diff --git a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp --- a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp @@ -7,17 +7,293 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/DebugUtils.h" + +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Format.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "orc" +using namespace llvm; + +namespace { + +#ifndef NDEBUG + +cl::opt PrintHidden("debug-orc-print-hidden", cl::init(true), + cl::desc("debug print hidden symbols defined by " + "materialization units"), + cl::Hidden); + +cl::opt PrintCallable("debug-orc-print-callable", cl::init(true), + cl::desc("debug print callable symbols defined by " + "materialization units"), + cl::Hidden); + +cl::opt PrintData("debug-orc-print-data", cl::init(true), + cl::desc("debug print data symbols defined by " + "materialization units"), + cl::Hidden); + +#endif // NDEBUG + +// SetPrinter predicate that prints every element. +template struct PrintAll { + bool operator()(const T &E) { return true; } +}; + +bool anyPrintSymbolOptionSet() { +#ifndef NDEBUG + return PrintHidden || PrintCallable || PrintData; +#else + return false; +#endif // NDEBUG +} + +bool flagsMatchCLOpts(const JITSymbolFlags &Flags) { +#ifndef NDEBUG + // Bail out early if this is a hidden symbol and we're not printing hiddens. + if (!PrintHidden && !Flags.isExported()) + return false; + + // Return true if this is callable and we're printing callables. + if (PrintCallable && Flags.isCallable()) + return true; + + // Return true if this is data and we're printing data. + if (PrintData && !Flags.isCallable()) + return true; + + // otherwise return false. + return false; +#else + return false; +#endif // NDEBUG +} + +// Prints a sequence of items, filtered by an user-supplied predicate. +template > +class SequencePrinter { +public: + SequencePrinter(const Sequence &S, char OpenSeq, char CloseSeq, + Pred ShouldPrint = Pred()) + : S(S), OpenSeq(OpenSeq), CloseSeq(CloseSeq), + ShouldPrint(std::move(ShouldPrint)) {} + + void printTo(llvm::raw_ostream &OS) const { + bool PrintComma = false; + OS << OpenSeq; + for (auto &E : S) { + if (ShouldPrint(E)) { + if (PrintComma) + OS << ','; + OS << ' ' << E; + PrintComma = true; + } + } + OS << ' ' << CloseSeq; + } + +private: + const Sequence &S; + char OpenSeq; + char CloseSeq; + mutable Pred ShouldPrint; +}; + +template +SequencePrinter printSequence(const Sequence &S, char OpenSeq, + char CloseSeq, Pred P = Pred()) { + return SequencePrinter(S, OpenSeq, CloseSeq, std::move(P)); +} + +// Render a SequencePrinter by delegating to its printTo method. 
+template +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const SequencePrinter &Printer) { + Printer.printTo(OS); + return OS; +} + +struct PrintSymbolFlagsMapElemsMatchingCLOpts { + bool operator()(const orc::SymbolFlagsMap::value_type &KV) { + return flagsMatchCLOpts(KV.second); + } +}; + +struct PrintSymbolMapElemsMatchingCLOpts { + bool operator()(const orc::SymbolMap::value_type &KV) { + return flagsMatchCLOpts(KV.second.getFlags()); + } +}; + +} // end anonymous namespace + namespace llvm { namespace orc { +raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym) { + return OS << *Sym; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) { + return OS << printSequence(Symbols, '{', '}', PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols) { + return OS << printSequence(Symbols, '[', ']', PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) { + if (Flags.hasError()) + OS << "[*ERROR*]"; + if (Flags.isCallable()) + OS << "[Callable]"; + else + OS << "[Data]"; + if (Flags.isWeak()) + OS << "[Weak]"; + else if (Flags.isCommon()) + OS << "[Common]"; + + if (!Flags.isExported()) + OS << "[Hidden]"; + + return OS; +} + +raw_ostream &operator<<(raw_ostream &OS, const JITEvaluatedSymbol &Sym) { + return OS << format("0x%016" PRIx64, Sym.getAddress()) << " " + << Sym.getFlags(); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV) { + return OS << "(\"" << KV.first << "\", " << KV.second << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV) { + return OS << "(\"" << KV.first << "\": " << KV.second << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags) { + return OS << printSequence(SymbolFlags, '{', '}', + PrintSymbolFlagsMapElemsMatchingCLOpts()); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols) { + return OS << printSequence(Symbols, '{', '}', + PrintSymbolMapElemsMatchingCLOpts()); +} + +raw_ostream &operator<<(raw_ostream &OS, + const SymbolDependenceMap::value_type &KV) { + return OS << "(" << KV.first->getName() << ", " << KV.second << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps) { + return OS << printSequence(Deps, '{', '}', + PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) { + OS << "MU@" << &MU << " (\"" << MU.getName() << "\""; + if (anyPrintSymbolOptionSet()) + OS << ", " << MU.getSymbols(); + return OS << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K) { + switch (K) { + case LookupKind::Static: + return OS << "Static"; + case LookupKind::DLSym: + return OS << "DLSym"; + } + llvm_unreachable("Invalid lookup kind"); +} + +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibLookupFlags &JDLookupFlags) { + switch (JDLookupFlags) { + case JITDylibLookupFlags::MatchExportedSymbolsOnly: + return OS << "MatchExportedSymbolsOnly"; + case JITDylibLookupFlags::MatchAllSymbols: + return OS << "MatchAllSymbols"; + } + llvm_unreachable("Invalid JITDylib lookup flags"); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags) { + switch (LookupFlags) { + case SymbolLookupFlags::RequiredSymbol: + return OS << "RequiredSymbol"; + case SymbolLookupFlags::WeaklyReferencedSymbol: + return OS << "WeaklyReferencedSymbol"; + } + llvm_unreachable("Invalid symbol lookup flags"); +} + +raw_ostream &operator<<(raw_ostream &OS, 
+ const SymbolLookupSet::value_type &KV) { + return OS << "(" << KV.first << ", " << KV.second << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet) { + return OS << printSequence(LookupSet, '{', '}', + PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibSearchOrder &SearchOrder) { + OS << "["; + if (!SearchOrder.empty()) { + assert(SearchOrder.front().first && + "JITDylibList entries must not be null"); + OS << " (\"" << SearchOrder.front().first->getName() << "\", " + << SearchOrder.begin()->second << ")"; + for (auto &KV : + make_range(std::next(SearchOrder.begin(), 1), SearchOrder.end())) { + assert(KV.first && "JITDylibList entries must not be null"); + OS << ", (\"" << KV.first->getName() << "\", " << KV.second << ")"; + } + } + OS << " ]"; + return OS; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases) { + OS << "{"; + for (auto &KV : Aliases) + OS << " " << *KV.first << ": " << KV.second.Aliasee << " " + << KV.second.AliasFlags; + OS << " }"; + return OS; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S) { + switch (S) { + case SymbolState::Invalid: + return OS << "Invalid"; + case SymbolState::NeverSearched: + return OS << "Never-Searched"; + case SymbolState::Materializing: + return OS << "Materializing"; + case SymbolState::Resolved: + return OS << "Resolved"; + case SymbolState::Emitted: + return OS << "Emitted"; + case SymbolState::Ready: + return OS << "Ready"; + } + llvm_unreachable("Invalid state"); +} + DumpObjects::DumpObjects(std::string DumpDir, std::string IdentifierOverride) : DumpDir(std::move(DumpDir)), IdentifierOverride(std::move(IdentifierOverride)) { diff --git a/llvm/lib/ExecutionEngine/Orc/Layer.cpp b/llvm/lib/ExecutionEngine/Orc/Layer.cpp --- a/llvm/lib/ExecutionEngine/Orc/Layer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Layer.cpp @@ -8,6 +8,7 @@ #include "llvm/ExecutionEngine/Orc/Layer.h" +#include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/IR/Constants.h" #include "llvm/Object/MachO.h" diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -9,6 +9,7 @@ #include "llvm/ExecutionEngine/Orc/MachOPlatform.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4783,6 +4783,12 @@ "Intrinsic does not support vectors", &Call); break; } + case Intrinsic::bswap: { + Type *Ty = Call.getType(); + unsigned Size = Ty->getScalarSizeInBits(); + Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call); + break; + } }; } diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -378,7 +378,7 @@ // For adjcallstackdown we convert it into an 'adiw reg, ' handling // the read and write of SP in I/O space. 
if (Amount != 0) { - assert(getStackAlignment() == 1 && "Unsupported stack alignment"); + assert(getStackAlign() == Align(1) && "Unsupported stack alignment"); if (Opcode == TII.getCallFrameSetupOpcode()) { fixStackStores(MBB, MI, TII, true); diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp --- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -223,8 +223,6 @@ MachineBasicBlock::iterator I) const { const MSP430InstrInfo &TII = *static_cast(MF.getSubtarget().getInstrInfo()); - unsigned StackAlign = getStackAlignment(); - if (!hasReservedCallFrame(MF)) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub SP, ' and the @@ -236,7 +234,7 @@ // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - Amount = (Amount+StackAlign-1)/StackAlign*StackAlign; + Amount = alignTo(Amount, getStackAlign()); MachineInstr *New = nullptr; if (Old.getOpcode() == TII.getCallFrameSetupOpcode()) { diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -298,6 +298,12 @@ bool expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); + bool expandSle(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + + bool expandSleImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, @@ -334,6 +340,12 @@ bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); + bool expandSne(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + + bool expandSneI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); @@ -349,6 +361,7 @@ bool parseSetArchDirective(); bool parseSetFeature(uint64_t Feature); bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup. + bool parseDirectiveCpAdd(SMLoc Loc); bool parseDirectiveCpLoad(SMLoc Loc); bool parseDirectiveCpLocal(SMLoc Loc); bool parseDirectiveCpRestore(SMLoc Loc); @@ -2515,6 +2528,14 @@ case Mips::SGTImm64: case Mips::SGTUImm64: return expandSgtImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SLE: + case Mips::SLEU: + return expandSle(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SLEImm: + case Mips::SLEUImm: + case Mips::SLEImm64: + case Mips::SLEUImm64: + return expandSleImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SLTImm64: if (isInt<16>(Inst.getOperand(2).getImm())) { Inst.setOpcode(Mips::SLTi64); @@ -2591,6 +2612,10 @@ return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQIMacro: return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SNEMacro: + return expandSne(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SNEIMacro: + return expandSneI(Inst, IDLoc, Out, STI) ? 
MER_Fail : MER_Success; case Mips::MFTC0: case Mips::MTTC0: case Mips::MFTGPR: case Mips::MTTGPR: case Mips::MFTLO: case Mips::MTTLO: @@ -4639,6 +4664,88 @@ return false; } +bool MipsAsmParser::expandSle(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isReg() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + unsigned OpReg = Inst.getOperand(2).getReg(); + unsigned OpCode; + + warnIfNoMacro(IDLoc); + + switch (Inst.getOpcode()) { + case Mips::SLE: + OpCode = Mips::SLT; + break; + case Mips::SLEU: + OpCode = Mips::SLTu; + break; + default: + llvm_unreachable("unexpected 'sge' opcode"); + } + + // $SrcReg <= $OpReg is equal to (not ($OpReg < $SrcReg)) + TOut.emitRRR(OpCode, DstReg, OpReg, SrcReg, IDLoc, STI); + TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI); + + return false; +} + +bool MipsAsmParser::expandSleImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isImm() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + unsigned OpRegCode; + + warnIfNoMacro(IDLoc); + + switch (Inst.getOpcode()) { + case Mips::SLEImm: + case Mips::SLEImm64: + OpRegCode = Mips::SLT; + break; + case Mips::SLEUImm: + case Mips::SLEUImm64: + OpRegCode = Mips::SLTu; + break; + default: + llvm_unreachable("unexpected 'sge' opcode with immediate"); + } + + // $SrcReg <= Imm is equal to (not (Imm < $SrcReg)) + unsigned ImmReg = DstReg; + if (DstReg == SrcReg) { + unsigned ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + ImmReg = ATReg; + } + + if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue), + false, IDLoc, Out, STI)) + return true; + + TOut.emitRRR(OpRegCode, DstReg, ImmReg, SrcReg, IDLoc, STI); + TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI); + + return false; +} + bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { @@ -5328,6 +5435,88 @@ return false; } +bool MipsAsmParser::expandSne(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isReg() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + unsigned OpReg = Inst.getOperand(2).getReg(); + + warnIfNoMacro(IDLoc); + + if (SrcReg != Mips::ZERO && OpReg != Mips::ZERO) { + TOut.emitRRR(Mips::XOR, DstReg, SrcReg, OpReg, IDLoc, STI); + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, DstReg, IDLoc, STI); + return false; + } + + unsigned Reg = SrcReg == Mips::ZERO ? 
OpReg : SrcReg; + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, Reg, IDLoc, STI); + return false; +} + +bool MipsAsmParser::expandSneI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isImm() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + + warnIfNoMacro(IDLoc); + + if (ImmValue == 0) { + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, SrcReg, IDLoc, STI); + return false; + } + + if (SrcReg == Mips::ZERO) { + Warning(IDLoc, "comparison is always true"); + if (loadImmediate(1, DstReg, Mips::NoRegister, true, false, IDLoc, Out, + STI)) + return true; + return false; + } + + unsigned Opc; + if (ImmValue > -0x8000 && ImmValue < 0) { + ImmValue = -ImmValue; + Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu; + } else { + Opc = Mips::XORi; + } + + if (isUInt<16>(ImmValue)) { + TOut.emitRRI(Opc, DstReg, SrcReg, ImmValue, IDLoc, STI); + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, DstReg, IDLoc, STI); + return false; + } + + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + + if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue), + false, IDLoc, Out, STI)) + return true; + + TOut.emitRRR(Mips::XOR, DstReg, SrcReg, ATReg, IDLoc, STI); + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, DstReg, IDLoc, STI); + return false; +} + // Map the DSP accumulator and control register to the corresponding gpr // operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions // do not map the DSP registers contigously to gpr registers. @@ -7444,6 +7633,31 @@ return inPicMode() && !(isABI_N32() || isABI_N64()); } +bool MipsAsmParser::parseDirectiveCpAdd(SMLoc Loc) { + SmallVector, 1> Reg; + OperandMatchResultTy ResTy = parseAnyRegister(Reg); + if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) { + reportParseError("expected register"); + return false; + } + + MipsOperand &RegOpnd = static_cast(*Reg[0]); + if (!RegOpnd.isGPRAsmReg()) { + reportParseError(RegOpnd.getStartLoc(), "invalid register"); + return false; + } + + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + getParser().Lex(); // Consume the EndOfStatement. 
+ + getTargetStreamer().emitDirectiveCpAdd(RegOpnd.getGPR32Reg()); + return false; +} + bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) { if (AssemblerOptions.back()->isReorder()) Warning(Loc, ".cpload should be inside a noreorder section"); @@ -8356,6 +8570,10 @@ MCAsmParser &Parser = getParser(); StringRef IDVal = DirectiveID.getString(); + if (IDVal == ".cpadd") { + parseDirectiveCpAdd(DirectiveID.getLoc()); + return false; + } if (IDVal == ".cpload") { parseDirectiveCpLoad(DirectiveID.getLoc()); return false; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -111,6 +111,7 @@ void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips3D() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoMips3D() { forbidModuleDirective(); } +void MipsTargetStreamer::emitDirectiveCpAdd(unsigned RegNo) {} void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {} void MipsTargetStreamer::emitDirectiveCpLocal(unsigned RegNo) { // .cplocal $reg @@ -662,6 +663,12 @@ OS << "," << FPUTopSavedRegOff << '\n'; } +void MipsTargetAsmStreamer::emitDirectiveCpAdd(unsigned RegNo) { + OS << "\t.cpadd\t$" + << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; + forbidModuleDirective(); +} + void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) { OS << "\t.cpload\t$" << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; @@ -1120,6 +1127,17 @@ FPROffset = FPUTopSavedRegOff; } +void MipsTargetELFStreamer::emitDirectiveCpAdd(unsigned RegNo) { + // .cpadd $reg + // This directive inserts code to add $gp to the argument's register + // when support for position independent code is enabled. 
+ if (!Pic) + return; + + emitAddu(RegNo, RegNo, GPReg, getABI().IsN64(), &STI); + forbidModuleDirective(); +} + void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { // .cpload $reg // This directive expands to: diff --git a/llvm/lib/Target/Mips/Mips64InstrInfo.td b/llvm/lib/Target/Mips/Mips64InstrInfo.td --- a/llvm/lib/Target/Mips/Mips64InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips64InstrInfo.td @@ -1248,5 +1248,19 @@ GPR64Opnd:$rs, imm64:$imm), 0>, GPR_64; +def SLEImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd), + (ins GPR64Opnd:$rs, imm64:$imm), + "sle\t$rd, $rs, $imm">, GPR_64; +def : MipsInstAlias<"sle $rs, $imm", (SLEImm64 GPR64Opnd:$rs, + GPR64Opnd:$rs, + imm64:$imm), 0>, GPR_64; + +def SLEUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd), + (ins GPR64Opnd:$rs, imm64:$imm), + "sleu\t$rd, $rs, $imm">, GPR_64; +def : MipsInstAlias<"sleu $rs, $imm", (SLEUImm64 GPR64Opnd:$rs, + GPR64Opnd:$rs, + imm64:$imm), 0>, GPR_64; + def : MipsInstAlias<"rdhwr $rt, $rs", (RDHWR64 GPR64Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, GPR_64; diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -179,8 +179,9 @@ MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering(); - unsigned Align = MinAlign(TFL->getStackAlignment(), Offset); - MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, Align); + Align Alignment = commonAlignment(TFL->getStackAlign(), Offset); + MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, + Alignment.value()); return MIRBuilder.buildFrameIndex(LLT::pointer(0, 32), FI).getReg(0); } diff --git a/llvm/lib/Target/Mips/MipsInstrFPU.td b/llvm/lib/Target/Mips/MipsInstrFPU.td --- a/llvm/lib/Target/Mips/MipsInstrFPU.td +++ b/llvm/lib/Target/Mips/MipsInstrFPU.td @@ -72,7 +72,7 @@ AssemblerPredicate<(all_of (not FeatureSingleFloat))>; def IsNotSoftFloat : Predicate<"!Subtarget->useSoftFloat()">, AssemblerPredicate<(all_of (not FeatureSoftFloat))>; -def Mips3D : Predicate<"Subtarget->has3D()">, +def HasMips3D : Predicate<"Subtarget->has3D()">, AssemblerPredicate<(all_of FeatureMips3D)>; //===----------------------------------------------------------------------===// @@ -479,7 +479,7 @@ } let DecoderNamespace = "MipsFP64" in { - let AdditionalPredicates = [Mips3D] in { + let AdditionalPredicates = [HasMips3D] in { def ADDR_PS64 : ADDS_FT<"addr.ps", FGR64Opnd, II_ADDR_PS, 0>, ADDS_FM<0x18, 22>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; def MULR_PS64 : ADDS_FT<"mulr.ps", FGR64Opnd, II_MULR_PS, 0>, diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -2589,6 +2589,22 @@ (SEQIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>, NOT_ASE_CNMIPS; +def SNEMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "sne $rd, $rs, $rt">, NOT_ASE_CNMIPS; + +def : MipsInstAlias<"sne $rd, $rs", + (SNEMacro GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, + NOT_ASE_CNMIPS; + +def SNEIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, simm32_relaxed:$imm), + "sne $rd, $rs, $imm">, NOT_ASE_CNMIPS; + +def : MipsInstAlias<"sne $rd, $imm", + (SNEIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>, + NOT_ASE_CNMIPS; + def MULImmMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs, 
simm32_relaxed:$imm), "mul\t$rd, $rs, $imm">, @@ -2736,6 +2752,34 @@ uimm32_coerced:$imm), 0>, GPR_32; + def SLE : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "sle\t$rd, $rs, $rt">, ISA_MIPS1; + def : MipsInstAlias<"sle $rs, $rt", + (SLE GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>, + ISA_MIPS1; + def SLEImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, simm32:$imm), + "sle\t$rd, $rs, $imm">, GPR_32; + def : MipsInstAlias<"sle $rs, $imm", (SLEImm GPR32Opnd:$rs, + GPR32Opnd:$rs, + simm32:$imm), 0>, + GPR_32; + + def SLEU : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "sleu\t$rd, $rs, $rt">, ISA_MIPS1; + def : MipsInstAlias<"sleu $rs, $rt", + (SLEU GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>, + ISA_MIPS1; + def SLEUImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, uimm32_coerced:$imm), + "sleu\t$rd, $rs, $imm">, GPR_32; + def : MipsInstAlias<"sleu $rs, $imm", (SLEUImm GPR32Opnd:$rs, + GPR32Opnd:$rs, + uimm32_coerced:$imm), 0>, + GPR_32; + def : MipsInstAlias< "not $rt, $rs", (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>, ISA_MIPS1; diff --git a/llvm/lib/Target/Mips/MipsScheduleP5600.td b/llvm/lib/Target/Mips/MipsScheduleP5600.td --- a/llvm/lib/Target/Mips/MipsScheduleP5600.td +++ b/llvm/lib/Target/Mips/MipsScheduleP5600.td @@ -20,7 +20,8 @@ IsGP64bit, IsPTR64bit, InMicroMips, InMips16Mode, HasCnMips, HasCnMipsP, - HasDSP, HasDSPR2, HasMT, HasCRC]; + HasDSP, HasDSPR2, HasMips3D, HasMT, + HasCRC]; } let SchedModel = MipsP5600Model in { @@ -458,8 +459,6 @@ def : InstRW<[P5600WriteFPUL], (instregex "^FCMP_(S32|D32|D64)$")>; def : InstRW<[P5600WriteFPUL], (instregex "^PseudoCVT_(S|D32|D64)_(L|W)$")>; def : InstRW<[P5600WriteFPUL], (instrs PLL_PS64, PLU_PS64, PUL_PS64, PUU_PS64)>; -def : InstRW<[P5600WriteFPUL], (instrs ADDR_PS64, MULR_PS64)>; -def : InstRW<[P5600WriteFPUL], (instrs CVT_PS_PW64, CVT_PW_PS64)>; // div.[ds], div.ps def : InstRW<[P5600WriteFPUDivS], (instrs FDIV_S)>; diff --git a/llvm/lib/Target/Mips/MipsSubtarget.h b/llvm/lib/Target/Mips/MipsSubtarget.h --- a/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/llvm/lib/Target/Mips/MipsSubtarget.h @@ -315,6 +315,7 @@ bool hasDSP() const { return HasDSP; } bool hasDSPR2() const { return HasDSPR2; } bool hasDSPR3() const { return HasDSPR3; } + bool has3D() const { return Has3D; } bool hasMSA() const { return HasMSA; } bool disableMadd4() const { return DisableMadd4; } bool hasEVA() const { return HasEVA; } diff --git a/llvm/lib/Target/Mips/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MipsTargetStreamer.h --- a/llvm/lib/Target/Mips/MipsTargetStreamer.h +++ b/llvm/lib/Target/Mips/MipsTargetStreamer.h @@ -92,6 +92,7 @@ virtual void emitDirectiveSetHardFloat(); // PIC support + virtual void emitDirectiveCpAdd(unsigned RegNo); virtual void emitDirectiveCpLoad(unsigned RegNo); virtual void emitDirectiveCpLocal(unsigned RegNo); virtual bool emitDirectiveCpRestore(int Offset, @@ -273,6 +274,7 @@ void emitDirectiveSetHardFloat() override; // PIC support + void emitDirectiveCpAdd(unsigned RegNo) override; void emitDirectiveCpLoad(unsigned RegNo) override; void emitDirectiveCpLocal(unsigned RegNo) override; @@ -345,6 +347,7 @@ void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override; // PIC support + void emitDirectiveCpAdd(unsigned RegNo) override; void emitDirectiveCpLoad(unsigned RegNo) override; void emitDirectiveCpLocal(unsigned RegNo) override; bool emitDirectiveCpRestore(int Offset, function_ref GetATReg, diff --git 
a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp --- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -113,8 +113,8 @@ createMemCpyLoopKnownSize(/* ConvertedInst */ SI, /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr, /* CopyLen */ CopyLen, - /* SrcAlign */ LI->getAlignment(), - /* DestAlign */ SI->getAlignment(), + /* SrcAlign */ LI->getAlign().valueOrOne(), + /* DestAlign */ SI->getAlign().valueOrOne(), /* SrcIsVolatile */ LI->isVolatile(), /* DstIsVolatile */ SI->isVolatile(), TTI); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -498,9 +498,9 @@ // Get stack alignments. const PPCFrameLowering *TFI = getFrameLowering(MF); - unsigned TargetAlign = TFI->getStackAlignment(); - unsigned MaxAlign = MFI.getMaxAlign().value(); - assert((maxCallFrameSize & (MaxAlign-1)) == 0 && + Align TargetAlign = TFI->getStackAlign(); + Align MaxAlign = MFI.getMaxAlign(); + assert(isAligned(MaxAlign, maxCallFrameSize) && "Maximum call-frame size not sufficiently aligned"); // Determine the previous frame's address. If FrameSize can't be @@ -545,7 +545,7 @@ // Unfortunately, there is no andi, only andi., and we can't insert that // here because we might clobber cr0 while it is live. BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg) - .addImm(~(MaxAlign-1)); + .addImm(~(MaxAlign.value() - 1)); unsigned NegSizeReg1 = NegSizeReg; NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC); @@ -570,7 +570,7 @@ // Unfortunately, there is no andi, only andi., and we can't insert that // here because we might clobber cr0 while it is live. BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg) - .addImm(~(MaxAlign-1)); + .addImm(~(MaxAlign.value() - 1)); unsigned NegSizeReg1 = NegSizeReg; NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -631,7 +631,6 @@ const MachineFrameInfo &MFI = MF.getFrameInfo(); const std::vector &CSI = MFI.getCalleeSavedInfo(); uint64_t StackSize = MFI.getStackSize(); - uint64_t StackAlign = getStackAlignment(); // Disable SplitSPAdjust if save-restore libcall used. The callee saved // registers will be pushed by the save-restore libcalls, so we don't have to @@ -648,7 +647,7 @@ // load/store instruction and we have to stick with the stack alignment. // 2048 is 16-byte alignment. The stack alignment for RV32 and RV64 is 16, // for RV32E is 4. So (2048 - StackAlign) will satisfy the stack alignment. - return 2048 - StackAlign; + return 2048 - getStackAlign().value(); } return 0; } diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -38,11 +38,10 @@ /// lowering implementations a chance to set up their default sections. void TargetLoweringObjectFile::Initialize(MCContext &ctx, const TargetMachine &TM) { - Ctx = &ctx; // `Initialize` can be called more than once. 
delete Mang; Mang = new Mangler(); - InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(), *Ctx, + InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(), ctx, TM.getCodeModel() == CodeModel::Large); // Reset various EH DWARF encodings. @@ -121,7 +120,7 @@ NameStr += GV->getParent()->getDataLayout().getPrivateGlobalPrefix(); TM.getNameWithPrefix(NameStr, GV, *Mang); NameStr.append(Suffix.begin(), Suffix.end()); - return Ctx->getOrCreateSymbol(NameStr); + return getContext().getOrCreateSymbol(NameStr); } MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol( @@ -353,7 +352,7 @@ const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const { // FIXME: It's not clear what, if any, default this should have - perhaps a // null return could mean 'no location' & we should just do that here. - return MCSymbolRefExpr::create(Sym, *Ctx); + return MCSymbolRefExpr::create(Sym, getContext()); } void TargetLoweringObjectFile::getNameWithPrefix( diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp --- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -198,7 +198,7 @@ if (CannotReserveFrame) return true; - unsigned StackAlign = TFL->getStackAlignment(); + Align StackAlign = TFL->getStackAlign(); int64_t Advantage = 0; for (auto CC : CallSeqVector) { @@ -221,7 +221,7 @@ // We'll need a add after the call. Advantage -= 3; // If we have to realign the stack, we'll also need a sub before - if (CC.ExpectedDist % StackAlign) + if (!isAligned(StackAlign, CC.ExpectedDist)) Advantage -= 3; // Now, for each push, we save ~3 bytes. For small constants, we actually, // save more (up to 5 bytes), but 3 should be a good approximation. diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3938,10 +3938,8 @@ const X86InstrInfo &XII = (const X86InstrInfo &)TII; unsigned Size = DL.getTypeAllocSize(LI->getType()); - unsigned Alignment = LI->getAlignment(); - - if (Alignment == 0) // Ensure that codegen never sees alignment 0 - Alignment = DL.getABITypeAlignment(LI->getType()); + Align Alignment = + DL.getValueOrABITypeAlignment(LI->getAlign(), LI->getType()); SmallVector AddrOps; AM.getFullAddress(AddrOps); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1764,7 +1764,7 @@ // RBP is not included in the callee saved register block. After pushing RBP, // everything is 16 byte aligned. Everything we allocate before an outgoing // call must also be 16 byte aligned. - unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment()); + unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign()); // Subtract out the size of the callee saved registers. This is how much stack // each funclet will allocate. return FrameSizeMinusRBP + XMMSize - CSSize; @@ -2051,7 +2051,8 @@ return getFrameIndexReference(MF, FI, FrameReg); FrameReg = TRI->getStackRegister(); - return alignDown(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second; + return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) + + it->second; } int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, @@ -2996,8 +2997,7 @@ // We need to keep the stack aligned properly. 
To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = getStackAlignment(); - Amount = alignTo(Amount, StackAlign); + Amount = alignTo(Amount, getStackAlign()); const Function &F = MF.getFunction(); bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4424,7 +4424,7 @@ unsigned X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, SelectionDAG &DAG) const { - const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment()); + const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign(); const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize"); @@ -23338,7 +23338,7 @@ " not tell us which reg is the stack pointer!"); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const Align StackAlign(TFI.getStackAlignment()); + const Align StackAlign = TFI.getStackAlign(); if (hasInlineStackProbe(MF)) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -37809,21 +37809,14 @@ if (!Subtarget.hasSSE2()) return SDValue(); - // Verify the type we're extracting from is any integer type above i16. - EVT VT = Extract->getOperand(0).getValueType(); - if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) + EVT ExtractVT = Extract->getValueType(0); + // Verify the type we're extracting is either i32 or i64. + // FIXME: Could support other types, but this is what we have coverage for. + if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64) return SDValue(); - unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) - RegSize = 512; - else if (Subtarget.hasAVX()) - RegSize = 256; - - // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512. - // TODO: We should be able to handle larger vectors by splitting them before - // feeding them into several SADs, and then reducing over those. - if (RegSize / VT.getVectorNumElements() < 8) + EVT VT = Extract->getOperand(0).getValueType(); + if (!isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); // Match shuffle + add pyramid. @@ -37839,8 +37832,8 @@ // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || - Root.getOpcode() == ISD::ZERO_EXTEND || - Root.getOpcode() == ISD::ANY_EXTEND)) + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an @@ -37860,7 +37853,7 @@ // If the original vector was wider than 8 elements, sum over the results // in the SAD vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); - MVT SadVT = SAD.getSimpleValueType(); + EVT SadVT = SAD.getValueType(); if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); @@ -37875,12 +37868,12 @@ } } - MVT Type = Extract->getSimpleValueType(0); - unsigned TypeSizeInBits = Type.getSizeInBits(); - // Return the lowest TypeSizeInBits bits. - MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); + unsigned ExtractSizeInBits = ExtractVT.getSizeInBits(); + // Return the lowest ExtractSizeInBits bits. 
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT, + SadVT.getSizeInBits() / ExtractSizeInBits); SAD = DAG.getBitcast(ResVT, SAD); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD, Extract->getOperand(1)); } @@ -45851,20 +45844,8 @@ // TODO: There's nothing special about i32, any integer type above i16 should // work just as well. - if (!VT.isVector() || !VT.isSimple() || - !(VT.getVectorElementType() == MVT::i32)) - return SDValue(); - - unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) - RegSize = 512; - else if (Subtarget.hasAVX()) - RegSize = 256; - - // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512. - // TODO: We should be able to handle larger vectors by splitting them before - // feeding them into several SADs, and then reducing over those. - if (VT.getSizeInBits() / 4 > RegSize) + if (!VT.isVector() || !isPowerOf2_32(VT.getVectorNumElements()) || + VT.getVectorElementType() != MVT::i32) return SDValue(); // We know N is a reduction add. To match SAD, we need one of the operands to @@ -45891,9 +45872,7 @@ // We need to turn the vector of i64 into a vector of i32. // If the reduction vector is at least as wide as the psadbw result, just // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of - // the PSADBW will be zero. If we promote/ narrow vectors, truncate the v2i64 - // result to v2i32 which will be removed by type legalization. If we/ widen - // narrow vectors then we bitcast to v4i32 and extract v2i32. + // the PSADBW will be zero. MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -474,7 +474,7 @@ unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Alignment, + unsigned Size, Align Alignment, bool AllowCommute) const; bool isHighLatencyDef(int opc) const override; @@ -594,7 +594,7 @@ unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align) const; + unsigned Size, Align Alignment) const; /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. 
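The X86InstrInfo signature changes above, like the frame-lowering and call-frame hunks earlier in this patch, replace raw unsigned alignments with the llvm::Align type. The stand-alone sketch below is added purely for illustration (the concrete values are arbitrary); it exercises the Align helpers the updated code relies on: alignTo for rounding sizes up, isAligned for the assertion-style checks, and ordered comparison between Align values in place of the old integer comparisons such as "4 <= Align".

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align StackAlign(16);                  // power-of-two alignment, here 16 bytes
  assert(llvm::alignTo(40, StackAlign) == 48); // rounds a size up to the next multiple of 16
  assert(llvm::isAligned(StackAlign, 64));     // true: 64 is a multiple of 16
  assert(StackAlign >= llvm::Align(4));        // ordered comparison replaces "4 <= Align"
  return 0;
}

Because Align guarantees a power-of-two, non-zero value, the converted code can drop the "alignment 0 means unknown" special cases and use MaybeAlign only where an absent alignment is genuinely meaningful, as in the DYNAMIC_STACKALLOC lowering earlier in this patch.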
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -625,8 +625,7 @@ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); if (isFrameInstr(MI)) { - unsigned StackAlign = TFI->getStackAlignment(); - int SPAdj = alignTo(getFrameSize(MI), StackAlign); + int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign()); SPAdj -= getFrameAdjustment(MI); if (!isFrameSetup(MI)) SPAdj = -SPAdj; @@ -3737,7 +3736,7 @@ "Stack slot too small for store"); unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = - (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) @@ -3752,7 +3751,7 @@ const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = - (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); @@ -5211,7 +5210,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align) const { + unsigned Size, Align Alignment) const { switch (MI.getOpcode()) { case X86::INSERTPSrr: case X86::VINSERTPSrr: @@ -5227,7 +5226,7 @@ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(4)) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; unsigned NewOpCode = @@ -5251,7 +5250,7 @@ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) { unsigned NewOpCode = (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : (MI.getOpcode() == X86::VMOVHLPSrr) ? 
X86::VMOVLPSrm : @@ -5270,7 +5269,7 @@ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) { MachineInstr *NewMI = FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this); return NewMI; @@ -5302,11 +5301,10 @@ return VRegDef && VRegDef->isImplicitDef(); } - MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align, bool AllowCommute) const { + unsigned Size, Align Alignment, bool AllowCommute) const { bool isSlowTwoMemOps = Subtarget.slowTwoMemOps(); bool isTwoAddrFold = false; @@ -5346,8 +5344,8 @@ MachineInstr *NewMI = nullptr; // Attempt to fold any custom cases we have. - if (MachineInstr *CustomMI = - foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + if (MachineInstr *CustomMI = foldMemoryOperandCustom( + MF, MI, OpNum, MOs, InsertPt, Size, Alignment)) return CustomMI; const X86MemoryFoldTableEntry *I = nullptr; @@ -5374,9 +5372,9 @@ if (I != nullptr) { unsigned Opcode = I->DstOp; - unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; - MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0; - if (Align < MinAlign) + MaybeAlign MinAlign = + decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT); + if (MinAlign && Alignment < *MinAlign) return nullptr; bool NarrowToMOV32rm = false; if (Size) { @@ -5451,8 +5449,8 @@ } // Attempt to fold with the commuted version of the instruction. - NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, - Size, Align, /*AllowCommute=*/false); + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size, + Alignment, /*AllowCommute=*/false); if (NewMI) return NewMI; @@ -5506,12 +5504,12 @@ const MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Size = MFI.getObjectSize(FrameIndex); - unsigned Alignment = MFI.getObjectAlignment(FrameIndex); + Align Alignment = MFI.getObjectAlign(FrameIndex); // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) Alignment = - std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment()); + std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -5811,36 +5809,36 @@ return nullptr; // Determine the alignment of the load. 
- unsigned Alignment = 0; + Align Alignment; if (LoadMI.hasOneMemOperand()) - Alignment = (*LoadMI.memoperands_begin())->getAlignment(); + Alignment = Align((*LoadMI.memoperands_begin())->getAlignment()); else switch (LoadMI.getOpcode()) { case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: - Alignment = 64; + Alignment = Align(64); break; case X86::AVX2_SETALLONES: case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: - Alignment = 32; + Alignment = Align(32); break; case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX512_128_SET0: case X86::FsFLD0F128: case X86::AVX512_FsFLD0F128: - Alignment = 16; + Alignment = Align(16); break; case X86::MMX_SET0: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: - Alignment = 8; + Alignment = Align(8); break; case X86::FsFLD0SS: case X86::AVX512_FsFLD0SS: - Alignment = 4; + Alignment = Align(4); break; default: return nullptr; @@ -5929,7 +5927,7 @@ Opc == X86::AVX1_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); - unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); + unsigned CPI = MCP.getConstantPoolIndex(C, Alignment.value()); // Create operands to load from the constant pool entry. MOs.push_back(MachineOperand::CreateReg(PICBase, false)); diff --git a/llvm/lib/Target/X86/X86TargetObjectFile.h b/llvm/lib/Target/X86/X86TargetObjectFile.h --- a/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -44,7 +44,6 @@ X86ELFTargetObjectFile() { PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT; } - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; /// Describe a TLS variable address within debug info. const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; }; diff --git a/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/llvm/lib/Target/X86/X86TargetObjectFile.cpp --- a/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -63,8 +63,3 @@ const MCSymbol *Sym) const { return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); } - -void X86ELFTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); -} diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2648,39 +2648,13 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, bool IsPairwise) { + // Just use the default implementation for pair reductions. + if (IsPairwise) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. - static const CostTblEntry SLMCostTblPairWise[] = { - { ISD::FADD, MVT::v2f64, 3 }, - { ISD::ADD, MVT::v2i64, 5 }, - }; - - static const CostTblEntry SSE2CostTblPairWise[] = { - { ISD::FADD, MVT::v2f64, 2 }, - { ISD::FADD, MVT::v4f32, 4 }, - { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". - { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32. - { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". 
- { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16 - { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16 - { ISD::ADD, MVT::v8i16, 5 }, - { ISD::ADD, MVT::v2i8, 2 }, - { ISD::ADD, MVT::v4i8, 2 }, - { ISD::ADD, MVT::v8i8, 2 }, - { ISD::ADD, MVT::v16i8, 3 }, - }; - - static const CostTblEntry AVX1CostTblPairWise[] = { - { ISD::FADD, MVT::v4f64, 5 }, - { ISD::FADD, MVT::v8f32, 7 }, - { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". - { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". - { ISD::ADD, MVT::v8i32, 5 }, - { ISD::ADD, MVT::v16i16, 6 }, - { ISD::ADD, MVT::v32i8, 4 }, - }; - static const CostTblEntry SLMCostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 3 }, { ISD::ADD, MVT::v2i64, 5 }, @@ -2721,62 +2695,44 @@ EVT VT = TLI->getValueType(DL, ValTy); if (VT.isSimple()) { MVT MTy = VT.getSimpleVT(); - if (IsPairwise) { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy)) - return Entry->Cost; - - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) - return Entry->Cost; - - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) - return Entry->Cost; - } else { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) - return Entry->Cost; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) + return Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) - return Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) - return Entry->Cost; - } + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; } std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; - if (IsPairwise) { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; - - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + unsigned ArithmeticCost = 0; + if (LT.first != 1 && MTy.isVector() && + MTy.getVectorNumElements() < ValTy->getVectorNumElements()) { + // Type needs to be split. We need LT.first - 1 arithmetic ops. 
+ Type *SingleOpTy = VectorType::get(ValTy->getVectorElementType(), + MTy.getVectorNumElements()); + ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy); + ArithmeticCost *= LT.first - 1; + } - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; - } else { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) + return ArithmeticCost + Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return ArithmeticCost + Entry->Cost; - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; - } + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return ArithmeticCost + Entry->Cost; // FIXME: These assume a naive kshift+binop lowering, which is probably // conservative in most cases. @@ -2825,9 +2781,9 @@ }; // Handle bool allof/anyof patterns. - if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) { + if (ValTy->getVectorElementType()->isIntegerTy(1)) { unsigned ArithmeticCost = 0; - if (MTy.isVector() && + if (LT.first != 1 && MTy.isVector() && MTy.getVectorNumElements() < ValTy->getVectorNumElements()) { // Type needs to be split. We need LT.first - 1 arithmetic ops. Type *SingleOpTy = VectorType::get(ValTy->getVectorElementType(), @@ -2848,9 +2804,77 @@ if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) return ArithmeticCost + Entry->Cost; + + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); + } + + unsigned NumVecElts = ValTy->getVectorNumElements(); + unsigned ScalarSize = ValTy->getScalarSizeInBits(); + + // Special case power of 2 reductions where the scalar type isn't changed + // by type legalization. + if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); + + unsigned ReductionCost = 0; + + Type *Ty = ValTy; + if (LT.first != 1 && MTy.isVector() && + MTy.getVectorNumElements() < ValTy->getVectorNumElements()) { + // Type needs to be split. We need LT.first - 1 arithmetic ops. + Ty = VectorType::get(ValTy->getVectorElementType(), + MTy.getVectorNumElements()); + ReductionCost = getArithmeticInstrCost(Opcode, Ty); + ReductionCost *= LT.first - 1; + NumVecElts = MTy.getVectorNumElements(); + } + + // Now handle reduction with the legal type, taking into account size changes + // at each level. + while (NumVecElts > 1) { + // Determine the size of the remaining vector we need to reduce. + unsigned Size = NumVecElts * ScalarSize; + NumVecElts /= 2; + // If we're reducing from 256/512 bits, use an extract_subvector. + if (Size > 128) { + Type *SubTy = VectorType::get(ValTy->getVectorElementType(), NumVecElts); + ReductionCost += + getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); + Ty = SubTy; + } else if (Size == 128) { + // Reducing from 128 bits is a permute of v2f64/v2i64. 
+ Type *ShufTy; + if (ValTy->isFloatingPointTy()) + ShufTy = VectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); + else + ShufTy = VectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); + ReductionCost += + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + } else if (Size == 64) { + // Reducing from 64 bits is a shuffle of v4f32/v4i32. + Type *ShufTy; + if (ValTy->isFloatingPointTy()) + ShufTy = VectorType::get(Type::getFloatTy(ValTy->getContext()), 4); + else + ShufTy = VectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); + ReductionCost += + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + } else { + // Reducing from smaller size is a shift by immediate. + Type *ShiftTy = VectorType::get( + Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); + ReductionCost += getArithmeticInstrCost( + Instruction::LShr, ShiftTy, TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OK_UniformConstantValue, + TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); + } + + // Add the arithmetic op for this level. + ReductionCost += getArithmeticInstrCost(Opcode, Ty); } - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); + // Add the final extract element to the cost. + return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -270,12 +270,6 @@ PM.add(createScopedNoAliasAAWrapperPass()); } -void PassManagerBuilder::addInstructionCombiningPass( - legacy::PassManagerBase &PM) const { - bool ExpensiveCombines = OptLevel > 2; - PM.add(createInstructionCombiningPass(ExpensiveCombines)); -} - void PassManagerBuilder::populateFunctionPassManager( legacy::FunctionPassManager &FPM) { addExtensionsToPM(EP_EarlyAsPossible, FPM); @@ -374,7 +368,7 @@ // Combine silly seq's if (OptLevel > 2) MPM.add(createAggressiveInstCombinerPass()); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); if (SizeLevel == 0 && !DisableLibCallsShrinkWrap) MPM.add(createLibCallsShrinkWrapPass()); addExtensionsToPM(EP_Peephole, MPM); @@ -409,7 +403,7 @@ // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the // need for this. MPM.add(createCFGSimplificationPass()); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); // We resume loop passes creating a second loop pipeline here. MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. @@ -440,7 +434,7 @@ // Run instcombine after redundancy elimination to exploit opportunities // opened up by them. - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, MPM); if (OptLevel > 1) { MPM.add(createJumpThreadingPass()); // Thread jumps @@ -458,7 +452,7 @@ MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Clean up after everything. 
- addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, MPM); if (EnableCHR && OptLevel >= 3 && @@ -569,7 +563,7 @@ MPM.add(createDeadArgEliminationPass()); // Dead argument elimination - addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE + MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE addExtensionsToPM(EP_Peephole, MPM); MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE @@ -741,7 +735,7 @@ // on -O1 and no #pragma is found). Would be good to have these two passes // as function calls, so that we can only pass them when the vectorizer // changed the code. - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); if (OptLevel > 1 && ExtraVectorizerPasses) { // At higher optimization levels, try to clean up any runtime overlap and // alignment checks inserted by the vectorizer. We want to track correllated @@ -750,11 +744,11 @@ // and unswitch the runtime checks if possible. Once hoisted, we may have // dead (or speculatable) control flows or more combining opportunities. MPM.add(createCorrelatedValuePropagationPass()); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); MPM.add(createCFGSimplificationPass()); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); } // Cleanup after loop vectorization, etc. Simplification passes like CVP and @@ -772,7 +766,7 @@ } addExtensionsToPM(EP_Peephole, MPM); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); if (EnableUnrollAndJam && !DisableUnrollLoops) { // Unroll and Jam. We do this before unroll but need to be in a separate @@ -787,7 +781,7 @@ if (!DisableUnrollLoops) { // LoopUnroll may generate some redundency to cleanup. - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); // Runtime unrolling will introduce runtime check in loop prologue. If the // unrolled loop is a inner loop, then the prologue will be inside the @@ -925,7 +919,7 @@ // calls, etc, so let instcombine do this. if (OptLevel > 2) PM.add(createAggressiveInstCombinerPass()); - addInstructionCombiningPass(PM); + PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); // Inline small functions @@ -958,7 +952,7 @@ PM.add(createArgumentPromotionPass()); // The IPO passes may leave cruft around. Clean up after them. - addInstructionCombiningPass(PM); + PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); @@ -1004,10 +998,10 @@ // we may have exposed more scalar opportunities. Run parts of the scalar // optimizer again at this point. PM.add(createVectorCombinePass()); - addInstructionCombiningPass(PM); // Initial cleanup + PM.add(createInstructionCombiningPass()); // Initial cleanup PM.add(createCFGSimplificationPass()); // if-convert PM.add(createSCCPPass()); // Propagate exposed constants - addInstructionCombiningPass(PM); // Clean up again + PM.add(createInstructionCombiningPass()); // Clean up again PM.add(createBitTrackingDCEPass()); // More scalar chains could be vectorized due to more alias information @@ -1021,7 +1015,7 @@ PM.add(createAlignmentFromAssumptionsPass()); // Cleanup and simplify the code after the scalar optimizations. 
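// A minimal sketch of what caller-side code looks like after this change,
// assuming the usual LLVM headers: the ExpensiveCombines flag is gone, so
// clients construct the pass directly, either with the default constructor or
// with the MaxIterations overload this patch retains. The wrapper function and
// the iteration cap of 4 are illustrative only, not part of the patch.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"

static void addInstCombine(llvm::legacy::PassManagerBase &PM,
                           bool CapIterations) {
  if (CapIterations)
    PM.add(llvm::createInstructionCombiningPass(/*MaxIterations=*/4));
  else
    PM.add(llvm::createInstructionCombiningPass());
}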
- addInstructionCombiningPass(PM); + PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -313,9 +313,6 @@ // Mode in which we are running the combiner. const bool MinimizeSize; - /// Enable combines that trigger rarely but are costly in compiletime. - const bool ExpensiveCombines; - AliasAnalysis *AA; // Required analyses. @@ -336,12 +333,12 @@ public: InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder, - bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA, + bool MinimizeSize, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), - ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT), + AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} /// Run the combiner over the entire worklist until it is empty. diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -130,11 +130,6 @@ EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), cl::init(true)); -// FIXME: This option is no longer used for anything and may be removed. -static cl::opt -EnableExpensiveCombines("expensive-combines", - cl::desc("Enable expensive instruction combines")); - static cl::opt LimitMaxIterations( "instcombine-max-iterations", cl::desc("Limit the maximum number of instruction combining iterations"), @@ -3743,11 +3738,8 @@ Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, bool ExpensiveCombines, unsigned MaxIterations, - LoopInfo *LI) { + ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) { auto &DL = F.getParent()->getDataLayout(); - if (EnableExpensiveCombines.getNumOccurrences()) - ExpensiveCombines = EnableExpensiveCombines; MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue()); /// Builder - This is an IRBuilder that automatically inserts new @@ -3789,7 +3781,7 @@ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); - InstCombiner IC(Worklist, Builder, F.hasMinSize(), ExpensiveCombines, AA, + InstCombiner IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, DT, ORE, BFI, PSI, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; @@ -3802,11 +3794,10 @@ return MadeIRChange; } -InstCombinePass::InstCombinePass(bool ExpensiveCombines) - : ExpensiveCombines(ExpensiveCombines), MaxIterations(LimitMaxIterations) {} +InstCombinePass::InstCombinePass() : MaxIterations(LimitMaxIterations) {} -InstCombinePass::InstCombinePass(bool ExpensiveCombines, unsigned MaxIterations) - : ExpensiveCombines(ExpensiveCombines), MaxIterations(MaxIterations) {} +InstCombinePass::InstCombinePass(unsigned MaxIterations) + : MaxIterations(MaxIterations) {} PreservedAnalyses InstCombinePass::run(Function &F, 
FunctionAnalysisManager &AM) { @@ -3826,8 +3817,7 @@ &AM.getResult(F) : nullptr; if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, - PSI, ExpensiveCombines, MaxIterations, - LI)) + PSI, MaxIterations, LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3877,22 +3867,18 @@ nullptr; return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, - PSI, ExpensiveCombines, MaxIterations, - LI); + PSI, MaxIterations, LI); } char InstructionCombiningPass::ID = 0; -InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines) - : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), - MaxIterations(InstCombineDefaultMaxIterations) { +InstructionCombiningPass::InstructionCombiningPass() + : FunctionPass(ID), MaxIterations(InstCombineDefaultMaxIterations) { initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); } -InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines, - unsigned MaxIterations) - : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), - MaxIterations(MaxIterations) { +InstructionCombiningPass::InstructionCombiningPass(unsigned MaxIterations) + : FunctionPass(ID), MaxIterations(MaxIterations) { initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); } @@ -3918,13 +3904,12 @@ initializeInstructionCombiningPassPass(*unwrap(R)); } -FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) { - return new InstructionCombiningPass(ExpensiveCombines); +FunctionPass *llvm::createInstructionCombiningPass() { + return new InstructionCombiningPass(); } -FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines, - unsigned MaxIterations) { - return new InstructionCombiningPass(ExpensiveCombines, MaxIterations); +FunctionPass *llvm::createInstructionCombiningPass(unsigned MaxIterations) { + return new InstructionCombiningPass(MaxIterations); } void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) { diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -16,7 +16,7 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, - unsigned SrcAlign, unsigned DstAlign, + Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI) { // No need to expand zero length copies. 
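// For the Align-typed parameters introduced here: a standalone sketch of the
// arithmetic behind commonAlignment(), which the hunks below use for the
// per-iteration loads and stores. commonAlignmentValue() is a local stand-in
// written without the LLVM headers; the real helper lives in
// llvm/Support/Alignment.h and operates on llvm::Align.
#include <cstdint>
#include <cstdio>

static uint64_t commonAlignmentValue(uint64_t AlignValue, uint64_t Offset) {
  // The largest power of two guaranteed to divide both the base alignment and
  // the offset, i.e. the lowest set bit of their bitwise OR.
  uint64_t Bits = AlignValue | Offset;
  return Bits & (~Bits + 1);
}

int main() {
  // A 16-byte aligned base advanced by an 8-byte loop element only guarantees
  // 8-byte alignment for each access inside the expanded loop.
  std::printf("%llu\n", (unsigned long long)commonAlignmentValue(16, 8)); // 8
  return 0;
}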
@@ -33,8 +33,8 @@ unsigned DstAS = cast(DstAddr->getType())->getAddressSpace(); Type *TypeOfCopyLen = CopyLen->getType(); - Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS, - SrcAlign, DstAlign); + Type *LoopOpType = TTI.getMemcpyLoopLoweringType( + Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value()); unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize; @@ -59,8 +59,8 @@ DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType); } - Align PartDstAlign(MinAlign(DstAlign, LoopOpSize)); - Align PartSrcAlign(MinAlign(SrcAlign, LoopOpSize)); + Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); + Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); IRBuilder<> LoopBuilder(LoopBB); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index"); @@ -92,11 +92,12 @@ SmallVector RemainingOps; TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, - SrcAS, DstAS, SrcAlign, DstAlign); + SrcAS, DstAS, SrcAlign.value(), + DstAlign.value()); for (auto OpTy : RemainingOps) { - Align PartSrcAlign(MinAlign(SrcAlign, BytesCopied)); - Align PartDstAlign(MinAlign(DstAlign, BytesCopied)); + Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied)); + Align PartDstAlign(commonAlignment(DstAlign, BytesCopied)); // Calaculate the new index unsigned OperandSize = DL.getTypeStoreSize(OpTy); @@ -131,8 +132,8 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, - Value *CopyLen, unsigned SrcAlign, - unsigned DstAlign, bool SrcIsVolatile, + Value *CopyLen, Align SrcAlign, + Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI) { BasicBlock *PreLoopBB = InsertBefore->getParent(); @@ -145,8 +146,8 @@ unsigned SrcAS = cast(SrcAddr->getType())->getAddressSpace(); unsigned DstAS = cast(DstAddr->getType())->getAddressSpace(); - Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS, - SrcAlign, DstAlign); + Type *LoopOpType = TTI.getMemcpyLoopLoweringType( + Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value()); unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); IRBuilder<> PLBuilder(PreLoopBB->getTerminator()); @@ -175,8 +176,8 @@ BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB); IRBuilder<> LoopBuilder(LoopBB); - Align PartSrcAlign(MinAlign(SrcAlign, LoopOpSize)); - Align PartDstAlign(MinAlign(DstAlign, LoopOpSize)); + Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); + Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index"); LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB); @@ -288,8 +289,8 @@ // return dst; // } static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr, - Value *DstAddr, Value *CopyLen, unsigned SrcAlign, - unsigned DstAlign, bool SrcIsVolatile, + Value *DstAddr, Value *CopyLen, Align SrcAlign, + Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile) { Type *TypeOfCopyLen = CopyLen->getType(); BasicBlock *OrigBB = InsertBefore->getParent(); @@ -323,8 +324,8 @@ ExitBB->setName("memmove_done"); unsigned PartSize = DL.getTypeStoreSize(EltTy); - Align PartSrcAlign(MinAlign(SrcAlign, PartSize)); - Align PartDstAlign(MinAlign(DstAlign, PartSize)); + Align PartSrcAlign(commonAlignment(SrcAlign, PartSize)); + Align PartDstAlign(commonAlignment(DstAlign, PartSize)); // Initial comparison of n == 0 that lets us skip 
the loops altogether. Shared // between both backwards and forward copy clauses. @@ -375,7 +376,7 @@ } static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, - Value *CopyLen, Value *SetValue, unsigned DstAlign, + Value *CopyLen, Value *SetValue, Align DstAlign, bool IsVolatile) { Type *TypeOfCopyLen = CopyLen->getType(); BasicBlock *OrigBB = InsertBefore->getParent(); @@ -399,7 +400,7 @@ OrigBB->getTerminator()->eraseFromParent(); unsigned PartSize = DL.getTypeStoreSize(SetValue->getType()); - Align PartAlign(MinAlign(DstAlign, PartSize)); + Align PartAlign(commonAlignment(DstAlign, PartSize)); IRBuilder<> LoopBuilder(LoopBB); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); @@ -421,25 +422,27 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy, const TargetTransformInfo &TTI) { if (ConstantInt *CI = dyn_cast(Memcpy->getLength())) { - createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy, - /* SrcAddr */ Memcpy->getRawSource(), - /* DstAddr */ Memcpy->getRawDest(), - /* CopyLen */ CI, - /* SrcAlign */ Memcpy->getSourceAlignment(), - /* DestAlign */ Memcpy->getDestAlignment(), - /* SrcIsVolatile */ Memcpy->isVolatile(), - /* DstIsVolatile */ Memcpy->isVolatile(), - /* TargetTransformInfo */ TTI); + createMemCpyLoopKnownSize( + /* InsertBefore */ Memcpy, + /* SrcAddr */ Memcpy->getRawSource(), + /* DstAddr */ Memcpy->getRawDest(), + /* CopyLen */ CI, + /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(), + /* DestAlign */ Memcpy->getDestAlign().valueOrOne(), + /* SrcIsVolatile */ Memcpy->isVolatile(), + /* DstIsVolatile */ Memcpy->isVolatile(), + /* TargetTransformInfo */ TTI); } else { - createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy, - /* SrcAddr */ Memcpy->getRawSource(), - /* DstAddr */ Memcpy->getRawDest(), - /* CopyLen */ Memcpy->getLength(), - /* SrcAlign */ Memcpy->getSourceAlignment(), - /* DestAlign */ Memcpy->getDestAlignment(), - /* SrcIsVolatile */ Memcpy->isVolatile(), - /* DstIsVolatile */ Memcpy->isVolatile(), - /* TargetTransfomrInfo */ TTI); + createMemCpyLoopUnknownSize( + /* InsertBefore */ Memcpy, + /* SrcAddr */ Memcpy->getRawSource(), + /* DstAddr */ Memcpy->getRawDest(), + /* CopyLen */ Memcpy->getLength(), + /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(), + /* DestAlign */ Memcpy->getDestAlign().valueOrOne(), + /* SrcIsVolatile */ Memcpy->isVolatile(), + /* DstIsVolatile */ Memcpy->isVolatile(), + /* TargetTransfomrInfo */ TTI); } } @@ -448,8 +451,8 @@ /* SrcAddr */ Memmove->getRawSource(), /* DstAddr */ Memmove->getRawDest(), /* CopyLen */ Memmove->getLength(), - /* SrcAlign */ Memmove->getSourceAlignment(), - /* DestAlign */ Memmove->getDestAlignment(), + /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(), + /* DestAlign */ Memmove->getDestAlign().valueOrOne(), /* SrcIsVolatile */ Memmove->isVolatile(), /* DstIsVolatile */ Memmove->isVolatile()); } @@ -459,6 +462,6 @@ /* DstAddr */ Memset->getRawDest(), /* CopyLen */ Memset->getLength(), /* SetValue */ Memset->getValue(), - /* Alignment */ Memset->getDestAlignment(), + /* Alignment */ Memset->getDestAlign().valueOrOne(), Memset->isVolatile()); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5308,8 +5308,10 @@ // of memory operations where possible. 
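// For the worklist hunks below: marking a node visited when it is pushed
// (rather than when it is popped) keeps any instruction from being queued
// twice. A standalone sketch of the idiom on a plain adjacency list; the
// container choices and countReachable() are stand-ins for the
// Worklist/Visited pair in the patch, not LLVM code.
#include <cstdio>
#include <unordered_set>
#include <vector>

static std::size_t countReachable(const std::vector<std::vector<int>> &Succ,
                                  int Root) {
  std::vector<int> Worklist;
  std::unordered_set<int> Visited;
  Worklist.push_back(Root);
  Visited.insert(Root); // mark at push time, mirroring the change below
  while (!Worklist.empty()) {
    int N = Worklist.back();
    Worklist.pop_back();
    for (int S : Succ[N])
      if (Visited.insert(S).second) // only push nodes seen for the first time
        Worklist.push_back(S);
  }
  return Visited.size();
}

int main() {
  // Diamond graph 0 -> {1, 2} -> 3: node 3 is pushed once, not twice.
  std::vector<std::vector<int>> Succ = {{1, 2}, {3}, {3}, {}};
  std::printf("%zu\n", countReachable(Succ, 0)); // 4
  return 0;
}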
SmallVector Worklist; SmallPtrSet Visited; - if (auto *I = dyn_cast(V)) + if (auto *I = dyn_cast(V)) { Worklist.push_back(I); + Visited.insert(I); + } // Traverse the expression tree in bottom-up order looking for loads. If we // encounter an instruction we don't yet handle, we give up. @@ -5317,7 +5319,6 @@ auto FoundUnknownInst = false; while (!Worklist.empty() && !FoundUnknownInst) { auto *I = Worklist.pop_back_val(); - Visited.insert(I); // We should only be looking at scalar instructions here. If the current // instruction has a vector type, give up. @@ -5337,7 +5338,7 @@ isa(I) || isa(I) || isa(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) - if (!Visited.count(J)) + if (Visited.insert(J).second) Worklist.push_back(J); } diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-add.ll @@ -14,18 +14,26 @@ ; SSE-LABEL: 'reduce_i64' ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 
@llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) @@ -38,9 +46,9 @@ ; SLM-LABEL: 'reduce_i64' ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) @@ -55,18 +63,26 @@ ; SSE-LABEL: 'reduce_i32' ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 24 for 
instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) @@ -79,9 +95,9 @@ ; SLM-LABEL: 'reduce_i32' ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) 
; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) @@ -97,27 +113,36 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) -; 
AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -134,17 +159,17 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) @@ -162,20 +187,30 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) @@ -183,8 +218,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -193,8 +228,8 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -203,8 +238,8 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i8' @@ -212,9 +247,9 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll b/llvm/test/Analysis/CostModel/X86/reduce-and.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-and.ll @@ -17,21 +17,13 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) @@ -58,21 +50,13 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 
@llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) @@ -91,58 +75,31 @@ } define i32 @reduce_i16(i32 %arg) { -; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 
@llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost 
Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -158,9 +115,9 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: 
Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) @@ -173,64 +130,34 @@ } define i32 @reduce_i8(i32 %arg) { -; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 
@llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = 
call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -238,9 +165,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) ; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -248,9 +175,9 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll @@ -20,9 +20,9 @@ ; AVX1-LABEL: 'reduce_i64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V4 = call i64 
@llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' @@ -93,9 +93,9 @@ ; AVX1-LABEL: 'reduce_i32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' @@ -123,58 +123,40 @@ } define i32 @reduce_i16(i32 %arg) { -; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found 
an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 
@llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 
@llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -190,9 +172,9 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) @@ -205,44 +187,24 @@ } define i32 @reduce_i8(i32 %arg) { -; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 
@llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 171 
for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 249 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' @@ -250,9 +212,9 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 157 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' @@ -260,9 +222,9 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 
for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -270,9 +232,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 115 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -280,9 +242,9 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll --- 
a/llvm/test/Analysis/CostModel/X86/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-or.ll @@ -17,21 +17,13 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) @@ -58,21 +50,13 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 
for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) @@ -91,58 +75,31 @@ } define i32 @reduce_i16(i32 %arg) { -; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 
undef -; -; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> 
undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -158,9 +115,9 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) @@ -173,64 +130,34 @@ } define i32 @reduce_i8(i32 %arg) { -; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost 
of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost 
of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 
for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -238,9 +165,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -248,9 +175,9 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost 
Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll @@ -17,21 +17,13 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) @@ -58,21 +50,13 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) @@ -91,58 +75,31 @@ } define i32 @reduce_i16(i32 %arg) { -; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = 
call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 
x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 
@llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -158,9 +115,9 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) @@ -173,64 +130,34 @@ } define i32 @reduce_i8(i32 %arg) { -; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for 
instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost 
Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -238,9 +165,9 @@ ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -248,9 +175,9 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: 
Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll --- a/llvm/test/Analysis/CostModel/X86/reduction.ll +++ b/llvm/test/Analysis/CostModel/X86/reduction.ll @@ -69,7 +69,7 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'reduction_cost_int' @@ -99,7 +99,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, @@ -127,7 +127,7 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -138,7 +138,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -149,7 +149,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -160,7 +160,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -171,7 +171,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: 
Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -199,7 +199,7 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -210,7 +210,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -221,7 +221,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -232,7 +232,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX-NEXT: Cost 
Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -243,7 +243,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -270,7 +270,7 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -280,7 +280,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -290,7 +290,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> 
%bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -300,7 +300,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -310,7 +310,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -438,7 +438,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX1-LABEL: 'no_pairwise_reduction4double' @@ -462,7 +462,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> @@ -482,7 +482,7 @@ ; SSE2-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8float' @@ -492,7 +492,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction8float' @@ -502,7 +502,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX1-LABEL: 'no_pairwise_reduction8float' @@ -532,7 +532,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> @@ -604,7 +604,7 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SSE-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX1-LABEL: 'no_pairwise_reduction4i64' @@ -628,7 +628,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> @@ -710,7 +710,7 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'no_pairwise_reduction8i32' @@ -740,7 +740,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> @@ -759,14 +759,14 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction2double' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
%rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction2double' @@ -806,7 +806,7 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction4float' @@ -816,7 +816,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction4float' @@ -826,7 +826,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX-LABEL: 'pairwise_reduction4float' @@ -836,7 +836,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; AVX-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'pairwise_reduction4float' @@ -846,7 +846,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -868,7 +868,7 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction4double' @@ -878,7 +878,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction4double' @@ -888,7 +888,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = 
extractelement <4 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX1-LABEL: 'pairwise_reduction4double' @@ -898,7 +898,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX2-LABEL: 'pairwise_reduction4double' @@ -918,7 +918,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> @@ -943,7 +943,7 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction8float' @@ -956,7 +956,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction8float' @@ -969,7 +969,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX1-LABEL: 'pairwise_reduction8float' @@ -982,7 +982,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX2-LABEL: 'pairwise_reduction8float' @@ -995,7 +995,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'pairwise_reduction8float' @@ -1008,7 +1008,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> @@ -1026,25 +1026,18 @@ } define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { -; SSE-LABEL: 
'pairwise_reduction2i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r -; -; AVX-LABEL: 'pairwise_reduction2i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; CHECK-LABEL: 'pairwise_reduction2i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction2i64' ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> @@ -1063,7 +1056,7 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret i32 %r ; ; SLM-LABEL: 'pairwise_reduction4i32' @@ -1073,7 +1066,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> @@ -1105,7 +1098,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX2-LABEL: 'pairwise_reduction4i64' @@ -1115,7 +1108,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction4i64' @@ -1150,7 +1143,7 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSSE3-LABEL: 'pairwise_reduction8i16' @@ -1163,7 +1156,7 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSE42-LABEL: 'pairwise_reduction8i16' @@ -1176,7 +1169,7 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; AVX-LABEL: 'pairwise_reduction8i16' @@ -1189,7 +1182,7 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SLM-LABEL: 'pairwise_reduction8i16' @@ -1202,7 +1195,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> @@ -1230,7 +1223,7 @@ ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = 
extractelement <8 x i32> %bin.rdx9, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'pairwise_reduction8i32' @@ -1243,7 +1236,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX2-LABEL: 'pairwise_reduction8i32' @@ -1256,7 +1249,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'pairwise_reduction8i32' @@ -1269,7 +1262,7 @@ ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll b/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +; Reproducer for a crash in computeKnownBitsFromOperator due to blindly +; casting from llvm::Operator to ExtractElementInst. That does not work +; if the Operator is a ConstantExpr. 
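The reproducer comment above describes the crash this new test guards against: code that assumed an ``llvm::Operator`` with an extractelement opcode is always an ``ExtractElementInst``, which is false for a ``ConstantExpr``. As a purely illustrative aside (not part of this patch and not the actual fix), a minimal C++ sketch of the unsafe cast versus a guarded ``dyn_cast`` might look like the following; the helper name ``getExtractedVector`` is hypothetical:

```cpp
// Hypothetical helper, for illustration only; not code changed by this patch.
// It shows why cast<ExtractElementInst> on an llvm::Operator that is really a
// ConstantExpr asserts, and how a dyn_cast guard avoids the crash.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

static Value *getExtractedVector(const Operator *Op) {
  // Unsafe pattern: Op may carry the ExtractElement opcode yet be a constant
  // expression rather than an ExtractElementInst, so this cast can assert on
  // IR like the constant expression in the test body below:
  //   const auto *EEI = cast<ExtractElementInst>(Op); // may assert

  // Guarded pattern: only use the instruction interface when Op really is an
  // instruction; fall back (here: nullptr) for ConstantExpr operators.
  if (const auto *EEI = dyn_cast<ExtractElementInst>(Op))
    return EEI->getVectorOperand();
  return nullptr;
}
```

The test body that reproduces the original assertion follows.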
+@g = global [21 x i32] zeroinitializer +define i32 @test1(i32 %a) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[T:%.*]] = sub i32 [[A:%.*]], extractelement (<4 x i32> ptrtoint (<4 x i32*> getelementptr inbounds ([21 x i32], [21 x i32]* @g, <4 x i32> zeroinitializer, <4 x i32> ) to <4 x i32>), i32 3) +; CHECK-NEXT: ret i32 [[T]] +; + %t = sub i32 %a, extractelement (<4 x i32> ptrtoint (<4 x i32 *> getelementptr inbounds ([21 x i32], [21 x i32] * @g, <4 x i32> zeroinitializer, <4 x i32> ) to <4 x i32>), i32 3) + ret i32 %t +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -o - %s | FileCheck %s --- name: test_unmerge_s32_s64 @@ -694,14 +694,21 @@ liveins: $vgpr0 ; CHECK-LABEL: name: test_unmerge_s1_s3 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[TRUNC:%[0-9]+]]:_(s3) = G_TRUNC [[COPY]](s32) - ; CHECK: [[UV:%[0-9]+]]:_(s1), [[UV1:%[0-9]+]]:_(s1), [[UV2:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[TRUNC]](s3) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s1) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s1) - ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s1) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) - ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32) - ; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: $vgpr0 = COPY [[COPY3]](s32) + ; CHECK: $vgpr1 = COPY [[COPY4]](s32) + ; CHECK: $vgpr2 = COPY [[COPY5]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s3) = G_TRUNC %0 %2:_(s1), %3:_(s1), %4:_(s1) = G_UNMERGE_VALUES %1 @@ -720,24 +727,51 @@ liveins: $vgpr0 ; CHECK-LABEL: name: test_unmerge_s1_s8 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32) - ; CHECK: [[UV:%[0-9]+]]:_(s1), [[UV1:%[0-9]+]]:_(s1), [[UV2:%[0-9]+]]:_(s1), [[UV3:%[0-9]+]]:_(s1), [[UV4:%[0-9]+]]:_(s1), [[UV5:%[0-9]+]]:_(s1), [[UV6:%[0-9]+]]:_(s1), [[UV7:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[TRUNC]](s8) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s1) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s1) - ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s1) - ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s1) - ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s1) - ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s1) - ; CHECK: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s1) - ; CHECK: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s1) - ; CHECK: $vgpr0 = COPY 
[[ANYEXT]](s32) - ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32) - ; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32) - ; CHECK: $vgpr3 = COPY [[ANYEXT3]](s32) - ; CHECK: $vgpr4 = COPY [[ANYEXT4]](s32) - ; CHECK: $vgpr5 = COPY [[ANYEXT5]](s32) - ; CHECK: $vgpr6 = COPY [[ANYEXT6]](s32) - ; CHECK: $vgpr7 = COPY [[ANYEXT7]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C4]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[C5]](s32) + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C6]](s32) + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[C7]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: $vgpr0 = COPY [[COPY8]](s32) + ; CHECK: $vgpr1 = COPY [[COPY9]](s32) + ; CHECK: $vgpr2 = COPY [[COPY10]](s32) + ; CHECK: $vgpr3 = COPY [[COPY11]](s32) + ; CHECK: $vgpr4 = COPY [[COPY12]](s32) + ; CHECK: $vgpr5 = COPY [[COPY13]](s32) + ; CHECK: $vgpr6 = COPY [[COPY14]](s32) + ; CHECK: $vgpr7 = COPY [[COPY15]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s8) = G_TRUNC %0 %2:_(s1), %3:_(s1), %4:_(s1), %5:_(s1), %6:_(s1), %7:_(s1), %8:_(s1), %9:_(s1) = G_UNMERGE_VALUES %1 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -2,6 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math 
-enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs + ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be ; beneficial even without fp32 denormals, but they do require no-infs-fp-math ; for correctness. @@ -376,9 +378,10 @@ %u = load volatile double, double addrspace(1)* %gep.3 %v = load volatile double, double addrspace(1)* %gep.4 - %tmp0 = fmul double %u, %v - %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 - %tmp2 = fsub double %x, %tmp1 + ; nsz flag is needed since this combine may change sign of zero + %tmp0 = fmul nsz double %u, %v + %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 + %tmp2 = fsub nsz double %x, %tmp1 store double %tmp2, double addrspace(1)* %gep.out ret void diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll --- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll @@ -4,6 +4,8 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s +; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs + ; Make sure we don't form mad with denormals ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s @@ -566,9 +568,10 @@ %u = load volatile float, float addrspace(1)* %gep.3 %v = load volatile float, float addrspace(1)* %gep.4 - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 - %tmp2 = fsub float %x, %tmp1 + ; nsz flag is needed since this combine may change sign of zero + %tmp0 = fmul nsz float %u, %v + %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 + %tmp2 = fsub nsz float %x, %tmp1 store float %tmp2, float addrspace(1)* %gep.out ret void diff --git a/llvm/test/CodeGen/PowerPC/fma-assoc.ll b/llvm/test/CodeGen/PowerPC/fma-assoc.ll --- a/llvm/test/CodeGen/PowerPC/fma-assoc.ll +++ b/llvm/test/CodeGen/PowerPC/fma-assoc.ll @@ -1,133 +1,110 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -fp-contract=fast -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SAFE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX -check-prefix=CHECK-VSX-SAFE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -fp-contract=fast -enable-unsafe-fp-math -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK -check-prefix=CHECK-UNSAFE %s -; RUN: llc -verify-machineinstrs < %s 
-mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -enable-unsafe-fp-math -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX -check-prefix=CHECK-UNSAFE-VSX %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -fp-contract=fast \ +; RUN: -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -fp-contract=fast -mattr=+vsx -disable-ppc-vsx-fma-mutation=false \ +; RUN: -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s define double @test_FMADD_ASSOC1(double %A, double %B, double %C, +; CHECK-LABEL: test_FMADD_ASSOC1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fadd 1, 0, 5 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xsadddp 1, 0, 5 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul double %A, %B ; [#uses=1] %G = fmul double %C, %D ; [#uses=1] %H = fadd double %F, %G ; [#uses=1] %I = fadd double %H, %E ; [#uses=1] ret double %I -; CHECK-SAFE-LABEL: test_FMADD_ASSOC1: -; CHECK-SAFE: fmul -; CHECK-SAFE-NEXT: fmadd -; CHECK-SAFE-NEXT: fadd -; CHECK-SAFE-NEXT: blr - -; CHECK-UNSAFE-LABEL: test_FMADD_ASSOC1: -; CHECK-UNSAFE: fmadd -; CHECK-UNSAFE-NEXT: fmadd -; CHECK-UNSAFE-NEXT: blr - -; CHECK-VSX-SAFE-LABEL: test_FMADD_ASSOC1: -; CHECK-VSX-SAFE: xsmuldp -; CHECK-VSX-SAFE-NEXT: xsmaddadp -; CHECK-VSX-SAFE-NEXT: xsadddp -; CHECK-VSX-SAFE-NEXT: blr - -; CHECK-VSX-UNSAFE-LABEL: test_FMADD_ASSOC1: -; CHECK-VSX-UNSAFE: xsmaddmdp -; CHECK-VSX-UNSAFE-NEXT: xsmaddadp -; CHECK-VSX-UNSAFE-NEXT: fmr -; CHECK-VSX-UNSAFE-NEXT: blr } define double @test_FMADD_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_FMADD_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fadd 1, 5, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xsadddp 1, 5, 0 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul double %A, %B ; [#uses=1] %G = fmul double %C, %D ; [#uses=1] %H = fadd double %F, %G ; [#uses=1] %I = fadd double %E, %H ; [#uses=1] ret double %I -; CHECK-SAFE-LABEL: test_FMADD_ASSOC2: -; CHECK-SAFE: fmul -; CHECK-SAFE-NEXT: fmadd -; CHECK-SAFE-NEXT: fadd -; CHECK-SAFE-NEXT: blr - -; CHECK-UNSAFE-LABEL: test_FMADD_ASSOC2: -; CHECK-UNSAFE: fmadd -; CHECK-UNSAFE-NEXT: fmadd -; CHECK-UNSAFE-NEXT: blr - -; CHECK-VSX-SAFE-LABEL: test_FMADD_ASSOC2: -; CHECK-VSX-SAFE: xsmuldp -; CHECK-VSX-SAFE-NEXT: xsmaddadp -; CHECK-VSX-SAFE-NEXT: xsadddp -; CHECK-VSX-SAFE-NEXT: blr - -; CHECK-VSX-UNSAFE-LABEL: test_FMADD_ASSOC2: -; CHECK-VSX-UNSAFE: xsmaddmdp -; CHECK-VSX-UNSAFE-NEXT: xsmaddadp -; CHECK-VSX-UNSAFE-NEXT: fmr -; CHECK-VSX-UNSAFE-NEXT: blr } define double @test_FMSUB_ASSOC1(double %A, double %B, double %C, +; CHECK-LABEL: test_FMSUB_ASSOC1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fsub 1, 0, 5 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xssubdp 1, 0, 5 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul double %A, %B ; [#uses=1] %G = fmul double 
%C, %D ; [#uses=1] %H = fadd double %F, %G ; [#uses=1] %I = fsub double %H, %E ; [#uses=1] ret double %I -; CHECK-SAFE-LABEL: test_FMSUB_ASSOC1: -; CHECK-SAFE: fmul -; CHECK-SAFE-NEXT: fmadd -; CHECK-SAFE-NEXT: fsub -; CHECK-SAFE-NEXT: blr - -; CHECK-UNSAFE-LABEL: test_FMSUB_ASSOC1: -; CHECK-UNSAFE: fmsub -; CHECK-UNSAFE-NEXT: fmadd -; CHECK-UNSAFE-NEXT: blr - -; CHECK-SAFE-VSX-LABEL: test_FMSUB_ASSOC1: -; CHECK-SAFE-VSX: xsmuldp -; CHECK-SAFE-VSX-NEXT: xsmaddadp -; CHECK-SAFE-VSX-NEXT: xssubdp -; CHECK-SAFE-VSX-NEXT: blr - -; CHECK-UNSAFE-VSX-LABEL: test_FMSUB_ASSOC1: -; CHECK-UNSAFE-VSX: xsmsubmdp -; CHECK-UNSAFE-VSX-NEXT: xsmaddadp -; CHECK-UNSAFE-VSX-NEXT: fmr -; CHECK-UNSAFE-VSX-NEXT: blr } define double @test_FMSUB_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_FMSUB_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fsub 1, 5, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xssubdp 1, 5, 0 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul double %A, %B ; [#uses=1] %G = fmul double %C, %D ; [#uses=1] %H = fadd double %F, %G ; [#uses=1] %I = fsub double %E, %H ; [#uses=1] ret double %I -; CHECK-SAFE-LABEL: test_FMSUB_ASSOC2: -; CHECK-SAFE: fmul -; CHECK-SAFE-NEXT: fmadd -; CHECK-SAFE-NEXT: fsub -; CHECK-SAFE-NEXT: blr - -; CHECK-UNSAFE-LABEL: test_FMSUB_ASSOC2: -; CHECK-UNSAFE: fnmsub -; CHECK-UNSAFE-NEXT: fnmsub -; CHECK-UNSAFE-NEXT: blr - -; CHECK-SAFE-VSX-LABEL: test_FMSUB_ASSOC2: -; CHECK-SAFE-VSX: xsmuldp -; CHECK-SAFE-VSX-NEXT: xsmaddadp -; CHECK-SAFE-VSX-NEXT: xssubdp -; CHECK-SAFE-VSX-NEXT: blr - -; CHECK-UNSAFE-VSX-LABEL: test_FMSUB_ASSOC2: -; CHECK-UNSAFE-VSX: xsnmsubmdp -; CHECK-UNSAFE-VSX-NEXT: xsnmsubadp -; CHECK-UNSAFE-VSX-NEXT: fmr -; CHECK-UNSAFE-VSX-NEXT: blr } define double @test_FMADD_ASSOC_EXT1(float %A, float %B, double %C, +; CHECK-LABEL: test_FMADD_ASSOC_EXT1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fpext float %F to double ; [#uses=1] @@ -135,18 +112,21 @@ %I = fadd double %H, %G ; [#uses=1] %J = fadd double %I, %E ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMADD_ASSOC_EXT1: -; CHECK: fmadd -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT1: -; CHECK-VSX: xsmaddmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: blr } define double @test_FMADD_ASSOC_EXT2(float %A, float %B, float %C, +; CHECK-LABEL: test_FMADD_ASSOC_EXT2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr float %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fmul float %C, %D ; [#uses=1] @@ -154,19 +134,20 @@ %I = fpext float %H to double ; [#uses=1] %J = fadd double %I, %E ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMADD_ASSOC_EXT2: -; CHECK: fmadd -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT2: -; CHECK-VSX: xsmaddmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: fmr -; 
CHECK-VSX-NEXT: blr } define double @test_FMADD_ASSOC_EXT3(float %A, float %B, double %C, +; CHECK-LABEL: test_FMADD_ASSOC_EXT3: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT3: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fpext float %F to double ; [#uses=1] @@ -174,18 +155,21 @@ %I = fadd double %H, %G ; [#uses=1] %J = fadd double %E, %I ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMADD_ASSOC_EXT3: -; CHECK: fmadd -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT3: -; CHECK-VSX: xsmaddmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: blr } define double @test_FMADD_ASSOC_EXT4(float %A, float %B, float %C, +; CHECK-LABEL: test_FMADD_ASSOC_EXT4: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT4: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr float %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fmul float %C, %D ; [#uses=1] @@ -193,19 +177,20 @@ %I = fpext float %H to double ; [#uses=1] %J = fadd double %E, %I ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMADD_ASSOC_EXT4: -; CHECK: fmadd -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT4: -; CHECK-VSX: xsmaddmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: fmr -; CHECK-VSX-NEXT: blr } define double @test_FMSUB_ASSOC_EXT1(float %A, float %B, double %C, +; CHECK-LABEL: test_FMSUB_ASSOC_EXT1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fpext float %F to double ; [#uses=1] @@ -213,18 +198,21 @@ %I = fadd double %H, %G ; [#uses=1] %J = fsub double %I, %E ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMSUB_ASSOC_EXT1: -; CHECK: fmsub -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT1: -; CHECK-VSX: xsmsubmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: blr } define double @test_FMSUB_ASSOC_EXT2(float %A, float %B, float %C, +; CHECK-LABEL: test_FMSUB_ASSOC_EXT2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr float %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fmul float %C, %D ; [#uses=1] @@ -232,19 +220,20 @@ %I = fpext float %H to double ; [#uses=1] %J = fsub double %I, %E ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMSUB_ASSOC_EXT2: -; CHECK: fmsub -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT2: -; CHECK-VSX: xsmsubmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: fmr -; CHECK-VSX-NEXT: blr } define double @test_FMSUB_ASSOC_EXT3(float %A, float %B, double %C, +; CHECK-LABEL: test_FMSUB_ASSOC_EXT3: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 1, 2, 5 +; CHECK-NEXT: fnmsub 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; 
CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT3: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsnmsubadp 1, 3, 4 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fpext float %F to double ; [#uses=1] @@ -252,18 +241,21 @@ %I = fadd double %H, %G ; [#uses=1] %J = fsub double %E, %I ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMSUB_ASSOC_EXT3: -; CHECK: fnmsub -; CHECK-NEXT: fnmsub -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT3: -; CHECK-VSX: xsnmsubmdp -; CHECK-VSX-NEXT: xsnmsubadp -; CHECK-VSX-NEXT: blr } define double @test_FMSUB_ASSOC_EXT4(float %A, float %B, float %C, +; CHECK-LABEL: test_FMSUB_ASSOC_EXT4: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 3, 4, 5 +; CHECK-NEXT: fnmsub 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT4: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsnmsubadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr float %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fmul float %C, %D ; [#uses=1] @@ -271,14 +263,282 @@ %I = fpext float %H to double ; [#uses=1] %J = fsub double %E, %I ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMSUB_ASSOC_EXT4: -; CHECK: fnmsub -; CHECK-NEXT: fnmsub -; CHECK-NEXT: blr +} -; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT4: -; CHECK-VSX: xsnmsubmdp -; CHECK-VSX-NEXT: xsnmsubadp -; CHECK-VSX-NEXT: fmr -; CHECK-VSX-NEXT: blr +define double @test_reassoc_FMADD_ASSOC1(double %A, double %B, double %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc double %A, %B ; [#uses=1] + %G = fmul reassoc double %C, %D ; [#uses=1] + %H = fadd reassoc double %F, %G ; [#uses=1] + %I = fadd reassoc double %H, %E ; [#uses=1] + ret double %I +} + +define double @test_reassoc_FMADD_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc double %A, %B ; [#uses=1] + %G = fmul reassoc double %C, %D ; [#uses=1] + %H = fadd reassoc double %F, %G ; [#uses=1] + %I = fadd reassoc double %E, %H ; [#uses=1] + ret double %I +} + +define double @test_reassoc_FMSUB_ASSOC1(double %A, double %B, double %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc double %A, %B ; [#uses=1] + %G = fmul reassoc double %C, %D ; [#uses=1] + %H = fadd reassoc double %F, %G ; [#uses=1] + %I = fsub reassoc double %H, %E ; [#uses=1] + ret double %I +} + +define double @test_reassoc_FMSUB_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 
0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fsub 1, 5, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xssubdp 1, 5, 0 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc double %A, %B ; [#uses=1] + %G = fmul reassoc double %C, %D ; [#uses=1] + %H = fadd reassoc double %F, %G ; [#uses=1] + %I = fsub reassoc double %E, %H ; [#uses=1] + ret double %I +} + +define double @test_fast_FMSUB_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_fast_FMSUB_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 3, 4, 5 +; CHECK-NEXT: fnmsub 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_fast_FMSUB_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsnmsubadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul fast double %A, %B ; [#uses=1] + %G = fmul fast double %C, %D ; [#uses=1] + %H = fadd fast double %F, %G ; [#uses=1] + %I = fsub fast double %E, %H ; [#uses=1] + ret double %I +} + +define double @test_reassoc_FMADD_ASSOC_EXT1(float %A, float %B, double %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC_EXT1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC_EXT1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul reassoc double %C, %D ; [#uses=1] + %I = fadd reassoc double %H, %G ; [#uses=1] + %J = fadd reassoc double %I, %E ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMADD_ASSOC_EXT2(float %A, float %B, float %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC_EXT2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC_EXT2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + float %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fmul reassoc float %C, %D ; [#uses=1] + %H = fadd reassoc float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fadd reassoc double %I, %E ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMADD_ASSOC_EXT3(float %A, float %B, double %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC_EXT3: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC_EXT3: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul reassoc double %C, %D ; [#uses=1] + %I = fadd reassoc double %H, %G ; [#uses=1] + %J = fadd reassoc double %E, %I ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMADD_ASSOC_EXT4(float %A, float %B, float %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC_EXT4: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC_EXT4: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; 
CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + float %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fmul reassoc float %C, %D ; [#uses=1] + %H = fadd reassoc float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fadd reassoc double %E, %I ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMSUB_ASSOC_EXT1(float %A, float %B, double %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul reassoc double %C, %D ; [#uses=1] + %I = fadd reassoc double %H, %G ; [#uses=1] + %J = fsub reassoc double %I, %E ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMSUB_ASSOC_EXT2(float %A, float %B, float %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + float %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fmul reassoc float %C, %D ; [#uses=1] + %H = fadd reassoc float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fsub reassoc double %I, %E ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMSUB_ASSOC_EXT3(float %A, float %B, double %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT3: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 1, 2, 5 +; CHECK-NEXT: fnmsub 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT3: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsnmsubadp 1, 3, 4 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul reassoc double %C, %D ; [#uses=1] + %I = fadd reassoc double %H, %G ; [#uses=1] + %J = fsub reassoc double %E, %I ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMSUB_ASSOC_EXT4(float %A, float %B, float %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT4: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 3, 4, 5 +; CHECK-NEXT: fnmsub 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT4: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsnmsubadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + float %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fmul reassoc float %C, %D ; [#uses=1] + %H = fadd reassoc float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fsub reassoc double %E, %I ; [#uses=1] + ret double %J } diff --git a/llvm/test/CodeGen/PowerPC/fma-negate.ll b/llvm/test/CodeGen/PowerPC/fma-negate.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/fma-negate.ll @@ -0,0 +1,314 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple powerpc64le -verify-machineinstrs \ +; RUN: | FileCheck -check-prefix=VSX %s +; RUN: llc < %s -mtriple powerpc64le -verify-machineinstrs -mattr=-vsx \ 
+; RUN: | FileCheck -check-prefix=NO-VSX %s + +define double @test_mul_sub_f64(double %a, double %b, double %c) { +; VSX-LABEL: test_mul_sub_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubadp 1, 2, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_mul_sub_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 2, 3, 1 +; NO-VSX-NEXT: blr +entry: + %0 = fmul contract reassoc double %b, %c + %1 = fsub contract reassoc double %a, %0 + ret double %1 +} + +define double @test_2mul_sub_f64(double %a, double %b, double %c, double %d) { +; VSX-LABEL: test_2mul_sub_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsmuldp 0, 3, 4 +; VSX-NEXT: xsmsubadp 0, 1, 2 +; VSX-NEXT: fmr 1, 0 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_2mul_sub_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fmul 0, 3, 4 +; NO-VSX-NEXT: fmsub 1, 1, 2, 0 +; NO-VSX-NEXT: blr +entry: + %0 = fmul contract reassoc double %a, %b + %1 = fmul contract reassoc double %c, %d + %2 = fsub contract reassoc double %0, %1 + ret double %2 +} + +define double @test_neg_fma_f64(double %a, double %b, double %c) { +; VSX-LABEL: test_neg_fma_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubadp 3, 1, 2 +; VSX-NEXT: fmr 1, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_neg_fma_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 1, 2, 3 +; NO-VSX-NEXT: blr +entry: + %0 = fsub contract reassoc double -0.0, %a + %1 = call contract reassoc double @llvm.fma.f64(double %0, double %b, + double %c) + ret double %1 +} + +define float @test_mul_sub_f32(float %a, float %b, float %c) { +; VSX-LABEL: test_mul_sub_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubasp 1, 2, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_mul_sub_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsubs 1, 2, 3, 1 +; NO-VSX-NEXT: blr +entry: + %0 = fmul contract reassoc float %b, %c + %1 = fsub contract reassoc float %a, %0 + ret float %1 +} + +define float @test_2mul_sub_f32(float %a, float %b, float %c, float %d) { +; VSX-LABEL: test_2mul_sub_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsmulsp 0, 3, 4 +; VSX-NEXT: xsmsubasp 0, 1, 2 +; VSX-NEXT: fmr 1, 0 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_2mul_sub_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fmuls 0, 3, 4 +; NO-VSX-NEXT: fmsubs 1, 1, 2, 0 +; NO-VSX-NEXT: blr +entry: + %0 = fmul contract reassoc float %a, %b + %1 = fmul contract reassoc float %c, %d + %2 = fsub contract reassoc float %0, %1 + ret float %2 +} + +define float @test_neg_fma_f32(float %a, float %b, float %c) { +; VSX-LABEL: test_neg_fma_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubasp 3, 1, 2 +; VSX-NEXT: fmr 1, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_neg_fma_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsubs 1, 1, 2, 3 +; NO-VSX-NEXT: blr +entry: + %0 = fsub contract reassoc float -0.0, %a + %1 = call contract reassoc float @llvm.fma.f32(float %0, float %b, float %c) + ret float %1 +} + +define <2 x double> @test_neg_fma_v2f64(<2 x double> %a, <2 x double> %b, +; VSX-LABEL: test_neg_fma_v2f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xvnmsubadp 36, 34, 35 +; VSX-NEXT: vmr 2, 4 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_neg_fma_v2f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 1, 3, 5 +; NO-VSX-NEXT: fnmsub 2, 2, 4, 6 +; NO-VSX-NEXT: blr + <2 x double> %c) { +entry: + %0 = fsub contract reassoc <2 x double> , %a + %1 = call contract reassoc <2 x double> @llvm.fma.v2f64(<2 x double> %0, + <2 x double> %b, + <2 x double> %c) + ret <2 x double> %1 +} + +define <4 x float> @test_neg_fma_v4f32(<4 x float> %a, <4 x float> %b, +; 
VSX-LABEL: test_neg_fma_v4f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xvnmsubasp 36, 34, 35 +; VSX-NEXT: vmr 2, 4 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_neg_fma_v4f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: vspltisb 5, -1 +; NO-VSX-NEXT: vslw 5, 5, 5 +; NO-VSX-NEXT: vsubfp 2, 5, 2 +; NO-VSX-NEXT: vmaddfp 2, 2, 3, 4 +; NO-VSX-NEXT: blr + <4 x float> %c) { +entry: + %0 = fsub contract reassoc <4 x float> , %a + %1 = call contract reassoc <4 x float> @llvm.fma.v4f32(<4 x float> %0, + <4 x float> %b, + <4 x float> %c) + ret <4 x float> %1 +} + +define double @test_fast_mul_sub_f64(double %a, double %b, double %c) { +; VSX-LABEL: test_fast_mul_sub_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubadp 1, 2, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_mul_sub_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 2, 3, 1 +; NO-VSX-NEXT: blr +entry: + %0 = fmul fast double %b, %c + %1 = fsub fast double %a, %0 + ret double %1 +} + +define double @test_fast_2mul_sub_f64(double %a, double %b, double %c, +; VSX-LABEL: test_fast_2mul_sub_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsmuldp 0, 3, 4 +; VSX-NEXT: xsmsubadp 0, 1, 2 +; VSX-NEXT: fmr 1, 0 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_2mul_sub_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fmul 0, 3, 4 +; NO-VSX-NEXT: fmsub 1, 1, 2, 0 +; NO-VSX-NEXT: blr + double %d) { +entry: + %0 = fmul fast double %a, %b + %1 = fmul fast double %c, %d + %2 = fsub fast double %0, %1 + ret double %2 +} + +define double @test_fast_neg_fma_f64(double %a, double %b, double %c) { +; VSX-LABEL: test_fast_neg_fma_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubadp 3, 1, 2 +; VSX-NEXT: fmr 1, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_neg_fma_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 1, 2, 3 +; NO-VSX-NEXT: blr +entry: + %0 = fsub fast double -0.0, %a + %1 = call fast double @llvm.fma.f64(double %0, double %b, double %c) + ret double %1 +} + +define float @test_fast_mul_sub_f32(float %a, float %b, float %c) { +; VSX-LABEL: test_fast_mul_sub_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubasp 1, 2, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_mul_sub_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsubs 1, 2, 3, 1 +; NO-VSX-NEXT: blr +entry: + %0 = fmul fast float %b, %c + %1 = fsub fast float %a, %0 + ret float %1 +} + +define float @test_fast_2mul_sub_f32(float %a, float %b, float %c, float %d) { +; VSX-LABEL: test_fast_2mul_sub_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsmulsp 0, 3, 4 +; VSX-NEXT: xsmsubasp 0, 1, 2 +; VSX-NEXT: fmr 1, 0 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_2mul_sub_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fmuls 0, 3, 4 +; NO-VSX-NEXT: fmsubs 1, 1, 2, 0 +; NO-VSX-NEXT: blr +entry: + %0 = fmul fast float %a, %b + %1 = fmul fast float %c, %d + %2 = fsub fast float %0, %1 + ret float %2 +} + +define float @test_fast_neg_fma_f32(float %a, float %b, float %c) { +; VSX-LABEL: test_fast_neg_fma_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubasp 3, 1, 2 +; VSX-NEXT: fmr 1, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_neg_fma_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsubs 1, 1, 2, 3 +; NO-VSX-NEXT: blr +entry: + %0 = fsub fast float -0.0, %a + %1 = call fast float @llvm.fma.f32(float %0, float %b, float %c) + ret float %1 +} + +define <2 x double> @test_fast_neg_fma_v2f64(<2 x double> %a, <2 x double> %b, +; VSX-LABEL: test_fast_neg_fma_v2f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xvnmsubadp 36, 34, 35 +; VSX-NEXT: vmr 2, 4 +; VSX-NEXT: blr +; +; 
NO-VSX-LABEL: test_fast_neg_fma_v2f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 1, 3, 5 +; NO-VSX-NEXT: fnmsub 2, 2, 4, 6 +; NO-VSX-NEXT: blr + <2 x double> %c) { +entry: + %0 = fsub fast <2 x double> , %a + %1 = call fast <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %b, + <2 x double> %c) + ret <2 x double> %1 +} + +define <4 x float> @test_fast_neg_fma_v4f32(<4 x float> %a, <4 x float> %b, +; VSX-LABEL: test_fast_neg_fma_v4f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xvnmsubasp 36, 34, 35 +; VSX-NEXT: vmr 2, 4 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_neg_fma_v4f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: vspltisb 5, -1 +; NO-VSX-NEXT: vslw 5, 5, 5 +; NO-VSX-NEXT: vsubfp 2, 5, 2 +; NO-VSX-NEXT: vmaddfp 2, 2, 3, 4 +; NO-VSX-NEXT: blr + <4 x float> %c) { +entry: + %0 = fsub fast <4 x float> , %a + %1 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %b, + <4 x float> %c) + ret <4 x float> %1 +} + +declare float @llvm.fma.f32(float %a, float %b, float %c) +declare double @llvm.fma.f64(double %a, double %b, double %c) +declare <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, + <4 x float> %c) +declare <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, + <2 x double> %c) diff --git a/llvm/test/CodeGen/Thumb/frame-access.ll b/llvm/test/CodeGen/Thumb/frame-access.ll --- a/llvm/test/CodeGen/Thumb/frame-access.ll +++ b/llvm/test/CodeGen/Thumb/frame-access.ll @@ -404,8 +404,8 @@ ; CHECK-NEXT: sub sp, #508 ; CHECK-NEXT: sub sp, #8 ; Argument addresses computed relative to BP -; CHECK: adds r0, r6, #7 -; CHECK-NEXT: adds r0, #13 +; CHECK: adds r4, r6, #7 +; CHECK-NEXT: adds r4, #13 ; CHECK: adds r1, r6, #7 ; CHECK-NEXT: adds r1, #9 ; CHECK: adds r5, r6, #7 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -400,18 +400,16 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %bb9 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 +; CHECK-NEXT: vldrwt.u32 q1, [r0] ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %bb27 ; CHECK-NEXT: pop {r7, pc} @@ -464,13 +462,12 @@ ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_1: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r12], #16 +; CHECK-NEXT: vldrwt.u32 q0, [r0] ; CHECK-NEXT: vpttt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 ; CHECK-NEXT: vctpt.32 r3 @@ -478,8 +475,7 @@ ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %bb32 ; CHECK-NEXT: pop {r7, pc} diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -8,14 +8,13 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK: .LBB0_1: @ %vector.body -; CHECK: vldrb.s16 q0, [r1], #8 -; CHECK-NEXT: vldrh.u16 q1, [r3], #16 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.s16 q0, [r1], #8 +; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -63,14 +62,13 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK: .LBB1_1: @ %vector.body -; CHECK: vldrb.u16 q0, [r1], #8 -; CHECK-NEXT: vldrh.u16 q1, [r3], #16 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u16 q0, [r1], #8 +; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -118,14 +116,13 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK: .LBB2_1: @ %vector.body -; CHECK: vldrh.s32 q0, [r1], #8 -; CHECK-NEXT: vldrw.u32 q1, [r3], #16 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.s32 q0, [r1], #8 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -173,14 +170,13 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK: .LBB3_1: @ %vector.body -; CHECK: vldrh.u32 q0, [r1], #8 -; CHECK-NEXT: vldrw.u32 q1, [r3], #16 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u32 q0, [r1], #8 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -133,26 +133,24 @@ ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: subs r3, #1 ; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vdup.32 q1, r3 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: sub.w r12, r3, #1 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop 
Header: Depth=1 -; CHECK-NEXT: vdup.32 q2, r12 -; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: vdup.32 q2, r3 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vorr q2, q2, q0 ; CHECK-NEXT: vpttt.u32 cs, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vldrwt.u32 q3, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q4, [r3], #16 +; CHECK-NEXT: vldrwt.u32 q4, [r2] ; CHECK-NEXT: vfma.f32 q4, q3, q2 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q4, [r2] -; CHECK-NEXT: mov r2, r3 +; CHECK-NEXT: vstrwt.32 q4, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9} diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -87,6 +87,193 @@ ret void } +define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind { +; SSE2-LABEL: avg_v24i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa 16(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE2-NEXT: paddd %xmm9, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; 
SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: paddd %xmm10, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: psubd %xmm6, %xmm3 +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: psubd %xmm6, %xmm4 +; SSE2-NEXT: psubd %xmm6, %xmm0 +; SSE2-NEXT: psubd %xmm6, %xmm5 +; SSE2-NEXT: psubd %xmm6, %xmm1 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: packuswb %xmm5, %xmm1 +; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX1-LABEL: avg_v24i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vmovdqa (%rsi), %xmm6 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = 
xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubd %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 +; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovq %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v24i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; 
AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v24i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512-NEXT: vpsubd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpsrld $1, %ymm1, %ymm1 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vmovq %xmm1, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = load <24 x i8>, <24 x i8>* %a + %2 = load <24 x i8>, <24 x i8>* %b + %3 = zext <24 x i8> %1 to <24 x i32> + %4 = zext <24 x i8> %2 to <24 x i32> + %5 = add nuw nsw <24 x i32> %3, + %6 = add nuw nsw <24 x i32> %5, %4 + %7 = lshr <24 x i32> %6, + %8 = trunc <24 x i32> %7 to <24 x i8> + store <24 x i8> %8, <24 x i8>* undef, align 4 + ret void +} + define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # %bb.0: @@ -724,6 +911,210 @@ ret void } +define void @avg_v40i16(<40 x i16>* %a, <40 x i16>* %b) nounwind { +; SSE2-LABEL: avg_v40i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa 64(%rdi), %xmm10 +; SSE2-NEXT: movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa 16(%rdi), %xmm6 +; SSE2-NEXT: movdqa 32(%rdi), %xmm13 +; SSE2-NEXT: movdqa 48(%rdi), %xmm12 +; SSE2-NEXT: movdqa 64(%rsi), %xmm8 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm14 +; SSE2-NEXT: movdqa 32(%rsi), %xmm11 +; SSE2-NEXT: movdqa 48(%rsi), %xmm9 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm14, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm13, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = 
xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE2-NEXT: paddd %xmm6, %xmm14 +; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE2-NEXT: paddd %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm12, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE2-NEXT: paddd %xmm13, %xmm11 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE2-NEXT: paddd %xmm12, %xmm9 +; SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE2-NEXT: paddd %xmm10, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm4 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm0, %xmm14 +; SSE2-NEXT: psubd %xmm0, %xmm7 +; SSE2-NEXT: psubd %xmm0, %xmm11 +; SSE2-NEXT: psubd %xmm0, %xmm6 +; SSE2-NEXT: psubd %xmm0, %xmm9 +; SSE2-NEXT: psubd %xmm0, %xmm5 +; SSE2-NEXT: psubd %xmm0, %xmm8 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: psrld $1, %xmm11 +; SSE2-NEXT: psrld $1, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = 
xmm9[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE2-NEXT: movdqu %xmm5, (%rax) +; SSE2-NEXT: movdqu %xmm4, (%rax) +; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX1-LABEL: avg_v40i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-NEXT: vpavgw 64(%rsi), %xmm4, %xmm4 +; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm4, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v40i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-NEXT: vpavgw 64(%rsi), %xmm4, %xmm4 +; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 +; AVX2-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 +; AVX2-NEXT: vmovdqu %xmm3, (%rax) +; AVX2-NEXT: vmovdqu %xmm2, (%rax) +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vmovdqu %xmm4, (%rax) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: avg_v40i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpsubd %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %xmm2, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v40i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512BW-NEXT: vpsubd %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX512BW-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm1 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqu %xmm1, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %1 = load <40 x i16>, <40 x i16>* %a + %2 = load <40 x i16>, <40 x i16>* %b + %3 = zext <40 x i16> %1 to <40 x i32> + %4 = zext <40 x i16> %2 to <40 x i32> + %5 = add nuw nsw <40 x i32> %3, + %6 = add nuw nsw <40 x i32> %5, %4 + %7 = lshr <40 x i32> %6, + %8 = trunc <40 x i32> %7 to <40 x i16> + store <40 x i16> %8, <40 x i16>* undef, align 4 + ret void +} + define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v4i8_2: ; SSE2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -151,135 +151,30 @@ define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm13, %xmm13 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm15, %xmm15 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 +; SSE2-NEXT: movdqa a+1040(%rax), %xmm3 +; SSE2-NEXT: psadbw b+1040(%rax), %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm7 -; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: psubd %xmm10, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm11, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE2-NEXT: psubd %xmm11, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: psubd %xmm6, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE2-NEXT: psubd %xmm6, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; SSE2-NEXT: psubd %xmm9, %xmm8 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; 
SSE2-NEXT: paddd %xmm6, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm13 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm14 +; SSE2-NEXT: psadbw b+1024(%rax), %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm15, %xmm0 -; SSE2-NEXT: paddd %xmm14, %xmm13 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm13, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] -; SSE2-NEXT: paddd %xmm6, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -416,452 +311,99 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: subq $200, %rsp -; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movaps a+1040(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 -; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm15, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm11, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm15, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; SSE2-NEXT: movdqa b+1072(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: movdqa b+1056(%rax), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: psubd %xmm7, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE2-NEXT: psubd %xmm7, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE2-NEXT: psubd %xmm7, %xmm8 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: psubd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: psubd %xmm0, %xmm15 -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = 
xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: psubd %xmm0, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: psubd %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE2-NEXT: psubd %xmm7, %xmm12 -; SSE2-NEXT: movdqa b+1040(%rax), %xmm13 -; SSE2-NEXT: movdqa %xmm13, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: psubd %xmm7, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm13, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; SSE2-NEXT: psubd %xmm13, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm13 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa a+1056(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1056(%rax), %xmm5 +; SSE2-NEXT: paddd 
%xmm5, %xmm2 +; SSE2-NEXT: movdqa a+1040(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1040(%rax), %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: movdqa a+1024(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1024(%rax), %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: movdqa a+1072(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1072(%rax), %xmm5 ; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm11 -; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm15, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm15 -; SSE2-NEXT: pxor %xmm1, %xmm15 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, %xmm1 -; SSE2-NEXT: movdqa %xmm13, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: addq $200, %rsp ; SSE2-NEXT: retq ; ; AVX1-LABEL: sad_avx64i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: subq $24, %rsp -; AVX1-NEXT: vpxor %xmm14, %xmm14, %xmm14 -; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX1-NEXT: vpxor %xmm15, %xmm15, %xmm15 -; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpxor %xmm13, %xmm13, %xmm13 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10 -; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; 
AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB2_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %ymm7, %ymm11 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm0 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpabsd %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm7 -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpabsd %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm11, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm7 -; AVX1-NEXT: vpabsd %xmm6, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd %xmm5, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm15, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15 -; AVX1-NEXT: vpabsd %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm14 -; AVX1-NEXT: vpabsd %xmm4, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 -; AVX1-NEXT: vextractf128 $1, 
%ymm9, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm1, %xmm10, %xmm1 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10 -; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd (%rsp), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm1, %xmm12, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12 +; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm3 +; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm4 +; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3 +; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm4 +; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: addq $4, %rax ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %middle.block -; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm7 +; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm12, %xmm13, %xmm1 -; AVX1-NEXT: vpaddd %xmm7, %xmm10, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -869,68 +411,25 @@ ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpxor %xmm0, 
%xmm0, %xmm0 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 -; AVX2-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpabsd %ymm9, %ymm8 -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 -; AVX2-NEXT: vpabsd %ymm10, %ymm8 -; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpabsd %ymm11, %ymm8 -; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpabsd %ymm12, %ymm8 -; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpabsd %ymm13, %ymm8 -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 -; AVX2-NEXT: vpabsd %ymm14, %ymm8 -; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpabsd %ymm15, %ymm8 -; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 +; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3 +; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3 +; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -947,37 +446,21 @@ ; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: .p2align 4, 0x90 ; AVX512F-NEXT: .LBB2_1: # %vector.body ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7 -; AVX512F-NEXT: vpabsd %zmm4, %zmm4 -; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 -; AVX512F-NEXT: vpabsd %zmm5, %zmm4 -; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 -; AVX512F-NEXT: vpabsd %zmm6, %zmm4 -; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 -; AVX512F-NEXT: vpabsd %zmm7, %zmm4 -; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm2 +; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm3 +; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpaddd %zmm1, %zmm2, 
%zmm1 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # %bb.2: # %middle.block -; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -1135,6 +618,81 @@ ret i32 %12 } +define i32 @sad_4i8() nounwind { +; SSE2-LABEL: sad_4i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB4_1: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: addq $4, %rax +; SSE2-NEXT: jne .LBB4_1 +; SSE2-NEXT: # %bb.2: # %middle.block +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; AVX-LABEL: sad_4i8: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB4_1: # %vector.body +; AVX-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: addq $4, %rax +; AVX-NEXT: jne .LBB4_1 +; AVX-NEXT: # %bb.2: # %middle.block +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] + %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index + %1 = bitcast i8* %0 to <4 x i8>* + %wide.load = load <4 x i8>, <4 x i8>* %1, align 4 + %2 = zext <4 x i8> %wide.load to <4 x i32> + %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index + %4 = bitcast i8* %3 to <4 x i8>* + %wide.load1 = load <4 x i8>, <4 x i8>* %4, align 4 + %5 = zext <4 x i8> %wide.load1 to <4 x i32> + %6 = sub nsw <4 x i32> %2, %5 + %7 = icmp sgt <4 x i32> %6, + %8 = sub nsw <4 x i32> zeroinitializer, %6 + %9 = select <4 x i1> %7, <4 x i32> %6, <4 x i32> %8 + %10 = add nsw <4 x i32> %9, %vec.phi + %index.next = add i64 %index, 4 + %11 = icmp eq i64 %index.next, 1024 + br i1 %11, label %middle.block, label %vector.body + +middle.block: + %h2 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> + %sum2 = add <4 x i32> %10, %h2 + %h3 = shufflevector <4 x i32> %sum2, <4 x i32> undef, <4 x i32> + %sum3 = add <4 x i32> %sum2, %h3 + %sum = extractelement <4 x i32> %sum3, i32 0 + ret i32 %sum +} + + define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* nocapture readonly %q) local_unnamed_addr #0 { ; SSE2-LABEL: sad_nonloop_4i8: ; SSE2: # %bb.0: @@ -1243,99 +801,16 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture 
readonly %p, i64, <32 x i8>* nocapture readonly %q) local_unnamed_addr #0 { ; SSE2-LABEL: sad_nonloop_32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm12, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm12, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqu (%rdx), %xmm7 -; SSE2-NEXT: movdqu 16(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm10 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} 
xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE2-NEXT: psubd %xmm6, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psubd %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: psubd %xmm3, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE2-NEXT: psubd %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm11 -; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm13, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm10, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm1, %xmm9 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm9, %xmm0 +; SSE2-NEXT: movdqu (%rdx), %xmm0 +; SSE2-NEXT: movdqu 16(%rdx), %xmm1 +; SSE2-NEXT: movdqu (%rdi), %xmm2 +; SSE2-NEXT: psadbw %xmm0, %xmm2 +; SSE2-NEXT: movdqu 16(%rdi), %xmm0 +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: sad_nonloop_32i8: @@ -1395,6 +870,115 @@ ret i32 %sum } +define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* nocapture readonly %q) local_unnamed_addr #0 { +; SSE2-LABEL: sad_nonloop_64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqu (%rdx), %xmm0 +; SSE2-NEXT: movdqu 16(%rdx), %xmm1 +; SSE2-NEXT: movdqu 32(%rdx), %xmm2 +; SSE2-NEXT: movdqu 48(%rdx), %xmm3 +; SSE2-NEXT: movdqu (%rdi), %xmm4 +; SSE2-NEXT: psadbw %xmm0, %xmm4 +; SSE2-NEXT: movdqu 16(%rdi), %xmm0 +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: movdqu 32(%rdi), %xmm1 +; SSE2-NEXT: psadbw %xmm2, %xmm1 +; SSE2-NEXT: movdqu 48(%rdi), %xmm2 +; SSE2-NEXT: psadbw %xmm3, %xmm2 +; SSE2-NEXT: paddq %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: paddq %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; AVX1-LABEL: sad_nonloop_64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 +; AVX1-NEXT: vpsadbw 48(%rdx), %xmm3, %xmm3 +; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 +; AVX1-NEXT: vpaddq 
%xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_nonloop_64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sad_nonloop_64i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX512F-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: sad_nonloop_64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpsadbw (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %v1 = load <64 x i8>, <64 x i8>* %p, align 1 + %z1 = zext <64 x i8> %v1 to <64 x i32> + %v2 = load <64 x i8>, <64 x i8>* %q, align 1 + %z2 = zext <64 x i8> %v2 to <64 x i32> + %sub = sub nsw <64 x i32> %z1, %z2 + %isneg = icmp sgt <64 x i32> %sub, + %neg = sub nsw <64 x i32> zeroinitializer, %sub + %abs = select <64 x i1> %isneg, <64 x i32> %sub, <64 x i32> %neg + %h64 = shufflevector <64 x i32> %abs, <64 x i32> undef, <64 x i32> + %sum64 = add <64 x i32> %abs, %h64 + %h32 = shufflevector <64 x i32> %sum64, <64 x i32> undef, <64 x i32> + %sum32 = add <64 x i32> %sum64, %h32 + %h0 = shufflevector <64 x i32> %sum32, <64 x i32> undef, <64 x i32> + %sum0 = add <64 x i32> %sum32, %h0 + %h1 = shufflevector <64 x i32> %sum0, <64 x i32> undef, <64 x i32> + %sum1 = add <64 x i32> %sum0, %h1 + %h2 = shufflevector <64 x i32> %sum1, <64 x i32> undef, <64 x i32> + %sum2 = add <64 x i32> %sum1, %h2 + %h3 = shufflevector <64 x i32> %sum2, <64 x i32> undef, <64 x i32> + %sum3 = add <64 x i32> %sum2, %h3 + %sum = extractelement <64 x i32> %sum3, i32 0 + ret i32 %sum +} + ; This contains an unrolled sad loop with a non-zero initial value. ; DAGCombiner reassociation previously rewrote the adds to move the constant vector further down the tree. This resulted in the vector-reduction flag being lost. 
define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %arg2, <16 x i8>* %arg3) { diff --git a/llvm/test/ExecutionEngine/OrcMCJIT/test-global-ctors.ll b/llvm/test/ExecutionEngine/OrcMCJIT/test-global-ctors.ll --- a/llvm/test/ExecutionEngine/OrcMCJIT/test-global-ctors.ll +++ b/llvm/test/ExecutionEngine/OrcMCJIT/test-global-ctors.ll @@ -1,8 +1,8 @@ ; RUN: %lli -jit-kind=orc-mcjit %s > /dev/null ; XFAIL: darwin @var = global i32 1, align 4 -@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @ctor_func }] -@llvm.global_dtors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @dtor_func }] +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @ctor_func, i8* null }] +@llvm.global_dtors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @dtor_func, i8* null }] define i32 @main() nounwind { entry: diff --git a/llvm/test/MC/Mips/cpadd-bad.s b/llvm/test/MC/Mips/cpadd-bad.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Mips/cpadd-bad.s @@ -0,0 +1,13 @@ +# RUN: not llvm-mc -triple=mips-unknown-linux-gnu %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple=mips64-unknown-linux-gnuabin32 %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple=mips64-unknown-linux-gnu %s 2>&1 | FileCheck %s + + .text + .cpadd $32 +# CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register + .cpadd $foo +# CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected register + .cpadd bar +# CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected register + .cpadd $25 foobar +# CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: unexpected token, expected end of statement diff --git a/llvm/test/MC/Mips/cpadd.s b/llvm/test/MC/Mips/cpadd.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Mips/cpadd.s @@ -0,0 +1,29 @@ +# RUN: llvm-mc -triple=mips-unknown-linux-gnu -position-independent %s \ +# RUN: | FileCheck -check-prefix=ASM %s +# RUN: llvm-mc -triple=mips64-unknown-linux-gnu -position-independent %s \ +# RUN: | FileCheck -check-prefix=ASM %s +# RUN: llvm-mc -triple=mips-unknown-linux-gnu %s \ +# RUN: | FileCheck -check-prefix=ASM %s + +# RUN: llvm-mc -triple=mips-unknown-linux-gnu \ +# RUN: -position-independent -filetype=obj -o - %s \ +# RUN: | llvm-objdump -d -r - | FileCheck -check-prefix=OBJ32-PIC %s +# RUN: llvm-mc -triple=mips64-unknown-linux-gnu \ +# RUN: -position-independent -filetype=obj -o - %s \ +# RUN: | llvm-objdump -d -r - | FileCheck -check-prefix=OBJ64-PIC %s + +# RUN: llvm-mc -triple=mips-unknown-linux-gnu \ +# RUN: -filetype=obj -o - %s \ +# RUN: | llvm-objdump -d -r - | FileCheck -check-prefix=OBJ32-NPIC %s +# RUN: llvm-mc -triple=mips64-unknown-linux-gnu \ +# RUN: -filetype=obj -o - %s \ +# RUN: | llvm-objdump -d -r - | FileCheck -check-prefix=OBJ64-NPIC %s + +# ASM: .cpadd $4 +# OBJ32-PIC: addu $4, $4, $gp +# OBJ64-PIC: daddu $4, $4, $gp +# OBJ32-NPIC-NOT: addu +# OBJ64-NPIC-NOT: daddu + + .text + .cpadd $4 diff --git a/llvm/test/MC/Mips/macro-sle.s b/llvm/test/MC/Mips/macro-sle.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Mips/macro-sle.s @@ -0,0 +1,31 @@ +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips1 < %s | FileCheck %s +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips64 < %s | FileCheck %s + +sle $4, $5 +# CHECK: slt $4, $5, $4 # encoding: [0x00,0xa4,0x20,0x2a] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sle $4, $5, $6 +# CHECK: slt $4, $6, $5 # encoding: 
[0x00,0xc5,0x20,0x2a] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sle $4, $5, 16 +# CHECK: addiu $4, $zero, 16 # encoding: [0x24,0x04,0x00,0x10] +# CHECK: slt $4, $4, $5 # encoding: [0x00,0x85,0x20,0x2a] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, $5 +# CHECK: sltu $4, $5, $4 # encoding: [0x00,0xa4,0x20,0x2b] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, $5, $6 +# CHECK: sltu $4, $6, $5 # encoding: [0x00,0xc5,0x20,0x2b] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, $5, 16 +# CHECK: addiu $4, $zero, 16 # encoding: [0x24,0x04,0x00,0x10] +# CHECK: sltu $4, $4, $5 # encoding: [0x00,0x85,0x20,0x2b] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sle $4, 16 +# CHECK: addiu $1, $zero, 16 # encoding: [0x24,0x01,0x00,0x10] +# CHECK: slt $4, $1, $4 # encoding: [0x00,0x24,0x20,0x2a] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, 16 +# CHECK: addiu $1, $zero, 16 # encoding: [0x24,0x01,0x00,0x10] +# CHECK: sltu $4, $1, $4 # encoding: [0x00,0x24,0x20,0x2b] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] diff --git a/llvm/test/MC/Mips/macro-sle64.s b/llvm/test/MC/Mips/macro-sle64.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Mips/macro-sle64.s @@ -0,0 +1,29 @@ +# RUN: not llvm-mc -arch=mips -mcpu=mips1 < %s 2>&1 \ +# RUN: | FileCheck --check-prefix=MIPS32 %s +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips64 < %s \ +# RUN: | FileCheck --check-prefix=MIPS64 %s + +sle $4, $5, 0x100000000 +# MIPS32: :[[@LINE-1]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled +# MIPS64: ori $4, $zero, 32768 # encoding: [0x34,0x04,0x80,0x00] +# MIPS64: dsll $4, $4, 17 # encoding: [0x00,0x04,0x24,0x78] +# MIPS64: slt $4, $4, $5 # encoding: [0x00,0x85,0x20,0x2a] +# MIPS64: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, $5, 0x100000000 +# MIPS32: :[[@LINE-1]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled +# MIPS64: ori $4, $zero, 32768 # encoding: [0x34,0x04,0x80,0x00] +# MIPS64: dsll $4, $4, 17 # encoding: [0x00,0x04,0x24,0x78] +# MIPS64: sltu $4, $4, $5 # encoding: [0x00,0x85,0x20,0x2b] +# MIPS64: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sle $4, 0x100000000 +# MIPS32: :[[@LINE-1]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled +# MIPS64: ori $1, $zero, 32768 # encoding: [0x34,0x01,0x80,0x00] +# MIPS64: dsll $1, $1, 17 # encoding: [0x00,0x01,0x0c,0x78] +# MIPS64: slt $4, $1, $4 # encoding: [0x00,0x24,0x20,0x2a] +# MIPS64: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, 0x100000000 +# MIPS32: :[[@LINE-1]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled +# MIPS64: ori $1, $zero, 32768 # encoding: [0x34,0x01,0x80,0x00] +# MIPS64: dsll $1, $1, 17 # encoding: [0x00,0x01,0x0c,0x78] +# MIPS64: sltu $4, $1, $4 # encoding: [0x00,0x24,0x20,0x2b] +# MIPS64: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] diff --git a/llvm/test/MC/Mips/macro-sne.s b/llvm/test/MC/Mips/macro-sne.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Mips/macro-sne.s @@ -0,0 +1,27 @@ +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips1 < %s \ +# RUN: | FileCheck --check-prefixes=ALL,MIPS32 %s +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips64 < %s \ +# RUN: | FileCheck --check-prefixes=ALL,MIPS64 %s + +sne $4, $5, $6 +# ALL: xor $4, $5, $6 # encoding: [0x00,0xa6,0x20,0x26] +# ALL: sltu $4, $zero, $4 # encoding: [0x00,0x04,0x20,0x2b] +sne $4, $zero, $6 +# ALL: sltu 
$4, $zero, $6 # encoding: [0x00,0x06,0x20,0x2b] +sne $4, $5, $zero +# ALL: sltu $4, $zero, $5 # encoding: [0x00,0x05,0x20,0x2b] +sne $4, $5, 0 +# ALL: sltu $4, $zero, $5 # encoding: [0x00,0x05,0x20,0x2b] +sne $4, $zero, 1 +# ALL: addiu $4, $zero, 1 # encoding: [0x24,0x04,0x00,0x01] +sne $4, $5, -1 +# MIPS32: addiu $4, $5, 1 # encoding: [0x24,0xa4,0x00,0x01] +# MIPS64: daddiu $4, $5, 1 # encoding: [0x64,0xa4,0x00,0x01] +# ALL: sltu $4, $zero, $4 # encoding: [0x00,0x04,0x20,0x2b] +sne $4, $5, 1 +# ALL: xori $4, $5, 1 # encoding: [0x38,0xa4,0x00,0x01] +# ALL: sltu $4, $zero, $4 # encoding: [0x00,0x04,0x20,0x2b] +sne $4, $5, 0x10000 +# ALL: lui $1, 1 # encoding: [0x3c,0x01,0x00,0x01] +# ALL: xor $4, $5, $1 # encoding: [0x00,0xa1,0x20,0x26] +# ALL: sltu $4, $zero, $4 # encoding: [0x00,0x04,0x20,0x2b] diff --git a/llvm/test/Transforms/InstCombine/all-bits-shift.ll b/llvm/test/Transforms/InstCombine/all-bits-shift.ll --- a/llvm/test/Transforms/InstCombine/all-bits-shift.ll +++ b/llvm/test/Transforms/InstCombine/all-bits-shift.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt -S -instcombine < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -S | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON -; RUN: opt < %s -instcombine -expensive-combines=0 -S | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF +; RUN: opt < %s -instcombine -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/InstCombine/call-returned.ll b/llvm/test/Transforms/InstCombine/call-returned.ll --- a/llvm/test/Transforms/InstCombine/call-returned.ll +++ b/llvm/test/Transforms/InstCombine/call-returned.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt -S -instcombine < %s | FileCheck %s declare i32 @passthru_i32(i32 returned) declare i8* @passthru_p8(i8* returned) diff --git a/llvm/test/Transforms/InstCombine/expensive-combines.ll b/llvm/test/Transforms/InstCombine/expensive-combines.ll deleted file mode 100644 --- a/llvm/test/Transforms/InstCombine/expensive-combines.ll +++ /dev/null @@ -1,28 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine < %s | FileCheck %s --check-prefix=DEFAULT -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefix=EXPENSIVE-ON -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefix=EXPENSIVE-OFF - -define void @test() { -; DEFAULT-LABEL: @test( -; DEFAULT-NEXT: [[CALL:%.*]] = call i32 @passthru(i32 0) -; DEFAULT-NEXT: call void @sink(i32 0) -; DEFAULT-NEXT: ret void -; -; EXPENSIVE-ON-LABEL: 
@test( -; EXPENSIVE-ON-NEXT: [[CALL:%.*]] = call i32 @passthru(i32 0) -; EXPENSIVE-ON-NEXT: call void @sink(i32 0) -; EXPENSIVE-ON-NEXT: ret void -; -; EXPENSIVE-OFF-LABEL: @test( -; EXPENSIVE-OFF-NEXT: [[CALL:%.*]] = call i32 @passthru(i32 0) -; EXPENSIVE-OFF-NEXT: call void @sink(i32 0) -; EXPENSIVE-OFF-NEXT: ret void -; - %call = call i32 @passthru(i32 0) - call void @sink(i32 %call) - ret void -} - -declare i32 @passthru(i32 returned) -declare void @sink(i32) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt -S -instcombine < %s | FileCheck %s define void @test_shl(i1 %x) { ; CHECK-LABEL: @test_shl( diff --git a/llvm/test/Transforms/InstCombine/known-signbit-shift.ll b/llvm/test/Transforms/InstCombine/known-signbit-shift.ll --- a/llvm/test/Transforms/InstCombine/known-signbit-shift.ll +++ b/llvm/test/Transforms/InstCombine/known-signbit-shift.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -expensive-combines=0 -S | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt < %s -instcombine -expensive-combines=1 -S | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt < %s -instcombine -S | FileCheck %s ; Result of left shifting a non-negative integer ; with nsw flag should also be non-negative diff --git a/llvm/test/Transforms/InstCombine/out-of-bounds-indexes.ll b/llvm/test/Transforms/InstCombine/out-of-bounds-indexes.ll --- a/llvm/test/Transforms/InstCombine/out-of-bounds-indexes.ll +++ b/llvm/test/Transforms/InstCombine/out-of-bounds-indexes.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt -S -instcombine < %s | FileCheck %s ; Check that we don't crash on unreasonable constant indexes define i32 @test_out_of_bounds(i32 %a, i1 %x, i1 %y) { diff --git a/llvm/test/Transforms/InstCombine/phi-shifts.ll b/llvm/test/Transforms/InstCombine/phi-shifts.ll --- a/llvm/test/Transforms/InstCombine/phi-shifts.ll +++ b/llvm/test/Transforms/InstCombine/phi-shifts.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt -S -instcombine < %s | FileCheck %s ; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15217 define i64 @fuzz15217(i1 %cond, i8* %Ptr, i64 %Val) { diff --git a/llvm/test/Transforms/InstCombine/pr44541.ll b/llvm/test/Transforms/InstCombine/pr44541.ll --- a/llvm/test/Transforms/InstCombine/pr44541.ll +++ b/llvm/test/Transforms/InstCombine/pr44541.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 
-instcombine-infinite-loop-threshold=2 < %s | FileCheck %s +; RUN: opt -S -instcombine -instcombine-infinite-loop-threshold=2 < %s | FileCheck %s ; This test used to cause an infinite combine loop. diff --git a/llvm/test/Transforms/InstSimplify/freeze.ll b/llvm/test/Transforms/InstSimplify/freeze.ll --- a/llvm/test/Transforms/InstSimplify/freeze.ll +++ b/llvm/test/Transforms/InstSimplify/freeze.ll @@ -19,6 +19,199 @@ ret i32 %x } +define float @make_const2() { +; CHECK-LABEL: @make_const2( +; CHECK-NEXT: [[X:%.*]] = freeze float 1.000000e+01 +; CHECK-NEXT: ret float [[X]] +; + %x = freeze float 10.0 + ret float %x +} + +@glb = constant i32 0 + +define i32* @make_const_glb() { +; CHECK-LABEL: @make_const_glb( +; CHECK-NEXT: ret i32* @glb +; + %k = freeze i32* @glb + ret i32* %k +} + +define i32()* @make_const_fn() { +; CHECK-LABEL: @make_const_fn( +; CHECK-NEXT: [[K:%.*]] = freeze i32 ()* @make_const +; CHECK-NEXT: ret i32 ()* [[K]] +; + %k = freeze i32()* @make_const + ret i32()* %k +} + +define i32* @make_const_null() { +; CHECK-LABEL: @make_const_null( +; CHECK-NEXT: [[K:%.*]] = freeze i32* null +; CHECK-NEXT: ret i32* [[K]] +; + %k = freeze i32* null + ret i32* %k +} + +define <2 x i32> @constvector() { +; CHECK-LABEL: @constvector( +; CHECK-NEXT: [[X:%.*]] = freeze <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[X]] +; + %x = freeze <2 x i32> + ret <2 x i32> %x +} + +define <2 x i32> @constvector_noopt() { +; CHECK-LABEL: @constvector_noopt( +; CHECK-NEXT: [[X:%.*]] = freeze <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[X]] +; + %x = freeze <2 x i32> + ret <2 x i32> %x +} + +define void @alloca() { +; CHECK-LABEL: @alloca( +; CHECK-NEXT: [[P:%.*]] = alloca i8 +; CHECK-NEXT: [[Y:%.*]] = freeze i8* [[P]] +; CHECK-NEXT: call void @f3(i8* [[Y]]) +; CHECK-NEXT: ret void +; + %p = alloca i8 + %y = freeze i8* %p + call void @f3(i8* %y) + ret void +} + +define i8* @gep() { +; CHECK-LABEL: @gep( +; CHECK-NEXT: [[P:%.*]] = alloca [4 x i8] +; CHECK-NEXT: [[Q:%.*]] = getelementptr [4 x i8], [4 x i8]* [[P]], i32 0, i32 6 +; CHECK-NEXT: [[Q2:%.*]] = freeze i8* [[Q]] +; CHECK-NEXT: ret i8* [[Q2]] +; + %p = alloca [4 x i8] + %q = getelementptr [4 x i8], [4 x i8]* %p, i32 0, i32 6 + %q2 = freeze i8* %q + ret i8* %q2 +} + +define i8* @gep_noopt(i32 %arg) { +; CHECK-LABEL: @gep_noopt( +; CHECK-NEXT: [[Q:%.*]] = getelementptr [4 x i8], [4 x i8]* null, i32 0, i32 [[ARG:%.*]] +; CHECK-NEXT: [[Q2:%.*]] = freeze i8* [[Q]] +; CHECK-NEXT: ret i8* [[Q2]] +; + %q = getelementptr [4 x i8], [4 x i8]* null, i32 0, i32 %arg + %q2 = freeze i8* %q + ret i8* %q2 +} + +define i8* @gep_inbounds() { +; CHECK-LABEL: @gep_inbounds( +; CHECK-NEXT: [[P:%.*]] = alloca [4 x i8] +; CHECK-NEXT: [[Q:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[P]], i32 0, i32 0 +; CHECK-NEXT: [[Q2:%.*]] = freeze i8* [[Q]] +; CHECK-NEXT: ret i8* [[Q2]] +; + %p = alloca [4 x i8] + %q = getelementptr inbounds [4 x i8], [4 x i8]* %p, i32 0, i32 0 + %q2 = freeze i8* %q + ret i8* %q2 +} + +define i8* @gep_inbounds_noopt(i32 %arg) { +; CHECK-LABEL: @gep_inbounds_noopt( +; CHECK-NEXT: [[P:%.*]] = alloca [4 x i8] +; CHECK-NEXT: [[Q:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[P]], i32 0, i32 [[ARG:%.*]] +; CHECK-NEXT: [[Q2:%.*]] = freeze i8* [[Q]] +; CHECK-NEXT: ret i8* [[Q2]] +; + %p = alloca [4 x i8] + %q = getelementptr inbounds [4 x i8], [4 x i8]* %p, i32 0, i32 %arg + %q2 = freeze i8* %q + ret i8* %q2 +} + +define i32* @gep_inbounds_null() { +; CHECK-LABEL: @gep_inbounds_null( +; CHECK-NEXT: [[K:%.*]] = freeze i32* null +; CHECK-NEXT: ret i32* 
[[K]] +; + %p = getelementptr inbounds i32, i32* null, i32 0 + %k = freeze i32* %p + ret i32* %k +} + +define i32* @gep_inbounds_null_noopt(i32* %p) { +; CHECK-LABEL: @gep_inbounds_null_noopt( +; CHECK-NEXT: [[K:%.*]] = freeze i32* [[P:%.*]] +; CHECK-NEXT: ret i32* [[K]] +; + %q = getelementptr inbounds i32, i32* %p, i32 0 + %k = freeze i32* %q + ret i32* %k +} + +define i1 @icmp(i32 %a, i32 %b) { +; CHECK-LABEL: @icmp( +; CHECK-NEXT: [[A_FR:%.*]] = freeze i32 [[A:%.*]] +; CHECK-NEXT: [[B_FR:%.*]] = freeze i32 [[B:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[A_FR]], [[B_FR]] +; CHECK-NEXT: ret i1 [[C]] +; + %a.fr = freeze i32 %a + %b.fr = freeze i32 %b + %c = icmp eq i32 %a.fr, %b.fr + %c.fr = freeze i1 %c + ret i1 %c.fr +} + +define i1 @icmp_noopt(i32 %a, i32 %b) { +; CHECK-LABEL: @icmp_noopt( +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[C_FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: ret i1 [[C_FR]] +; + %c = icmp eq i32 %a, %b + %c.fr = freeze i1 %c + ret i1 %c.fr +} + +define i1 @fcmp(float %x, float %y) { +; CHECK-LABEL: @fcmp( +; CHECK-NEXT: [[FX:%.*]] = freeze float [[X:%.*]] +; CHECK-NEXT: [[FY:%.*]] = freeze float [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[FX]], [[FY]] +; CHECK-NEXT: [[FC:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: ret i1 [[FC]] +; + %fx = freeze float %x + %fy = freeze float %y + %c = fcmp oeq float %fx, %fy + %fc = freeze i1 %c + ret i1 %fc +} + +define i1 @fcmp_noopt(float %x, float %y) { +; CHECK-LABEL: @fcmp_noopt( +; CHECK-NEXT: [[FX:%.*]] = freeze float [[X:%.*]] +; CHECK-NEXT: [[FY:%.*]] = freeze float [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = fcmp nnan oeq float [[FX]], [[FY]] +; CHECK-NEXT: [[FC:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: ret i1 [[FC]] +; + %fx = freeze float %x + %fy = freeze float %y + %c = fcmp nnan oeq float %fx, %fy + %fc = freeze i1 %c + ret i1 %fc +} + define i1 @brcond(i1 %c, i1 %c2) { ; CHECK-LABEL: @brcond( ; CHECK-NEXT: br i1 [[C:%.*]], label [[A:%.*]], label [[B:%.*]] @@ -40,6 +233,66 @@ ret i1 %f2 } +define i32 @phi(i1 %cond, i1 %cond2, i32 %a0, i32 %a1) { +; CHECK-LABEL: @phi( +; CHECK-NEXT: ENTRY: +; CHECK-NEXT: [[A0_FR:%.*]] = freeze i32 [[A0:%.*]] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: BB1: +; CHECK-NEXT: [[A1_FR:%.*]] = freeze i32 [[A1:%.*]] +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[BB2]], label [[EXIT:%.*]] +; CHECK: BB2: +; CHECK-NEXT: [[PHI1:%.*]] = phi i32 [ [[A0_FR]], [[ENTRY:%.*]] ], [ [[A1_FR]], [[BB1]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: EXIT: +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[A0_FR]], [[BB1]] ], [ [[PHI1]], [[BB2]] ] +; CHECK-NEXT: [[PHI2_FR:%.*]] = freeze i32 [[PHI2]] +; CHECK-NEXT: ret i32 [[PHI2_FR]] +; +ENTRY: + %a0.fr = freeze i32 %a0 + br i1 %cond, label %BB1, label %BB2 +BB1: + %a1.fr = freeze i32 %a1 + br i1 %cond2, label %BB2, label %EXIT +BB2: + %phi1 = phi i32 [%a0.fr, %ENTRY], [%a1.fr, %BB1] + br label %EXIT +EXIT: + %phi2 = phi i32 [%a0.fr, %BB1], [%phi1, %BB2] + %phi2.fr = freeze i32 %phi2 + ret i32 %phi2.fr +} + +define i32 @phi_noopt(i1 %cond, i1 %cond2, i32 %a0, i32 %a1) { +; CHECK-LABEL: @phi_noopt( +; CHECK-NEXT: ENTRY: +; CHECK-NEXT: [[A0_FR:%.*]] = freeze i32 [[A0:%.*]] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: BB1: +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[BB2]], label [[EXIT:%.*]] +; CHECK: BB2: +; CHECK-NEXT: [[PHI1:%.*]] = phi i32 [ [[A0_FR]], [[ENTRY:%.*]] ], [ [[A1:%.*]], [[BB1]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: EXIT: +; CHECK-NEXT: [[PHI2:%.*]] 
= phi i32 [ [[A0_FR]], [[BB1]] ], [ [[PHI1]], [[BB2]] ] +; CHECK-NEXT: [[PHI2_FR:%.*]] = freeze i32 [[PHI2]] +; CHECK-NEXT: ret i32 [[PHI2_FR]] +; +ENTRY: + %a0.fr = freeze i32 %a0 + br i1 %cond, label %BB1, label %BB2 +BB1: + br i1 %cond2, label %BB2, label %EXIT +BB2: + %phi1 = phi i32 [%a0.fr, %ENTRY], [%a1, %BB1] + br label %EXIT +EXIT: + %phi2 = phi i32 [%a0.fr, %BB1], [%phi1, %BB2] + %phi2.fr = freeze i32 %phi2 + ret i32 %phi2.fr +} + define i32 @brcond_switch(i32 %x) { ; CHECK-LABEL: @brcond_switch( ; CHECK-NEXT: switch i32 [[X:%.*]], label [[EXIT:%.*]] [ @@ -81,3 +334,4 @@ } declare void @f1(i1) declare void @f2() +declare void @f3(i8*) diff --git a/llvm/test/Transforms/SCCP/apint-xor.ll b/llvm/test/Transforms/SCCP/apint-xor.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SCCP/apint-xor.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -sccp -S | FileCheck %s + +; Test some XOR simplifications / range propagation. +define void@xor1(i1 %cmp) { +; CHECK-LABEL: @xor1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[CMP:%.*]], label [[IF_TRUE:%.*]], label [[END:%.*]] +; CHECK: if.true: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: + %p = phi i32 [ 11, %entry ], [ 11, %if.true] + %xor.1 = xor i32 %p, %p + %c.1 = icmp eq i32 %xor.1, 0 + call void @use(i1 %c.1) + %c.2 = icmp eq i32 %xor.1, 10 + call void @use(i1 %c.2) + %xor.2 = xor i32 %p, 1 + %c.3 = icmp eq i32 %xor.2, 11 + call void @use(i1 %c.3) + %c.4 = icmp eq i32 %xor.2, 10 + call void @use(i1 %c.4) + ret void +} + +declare void @use(i1) diff --git a/llvm/test/Transforms/SCCP/conditions-iter-order.ll b/llvm/test/Transforms/SCCP/conditions-iter-order.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SCCP/conditions-iter-order.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -ipsccp -S %s | FileCheck %s + +declare noalias i8* @malloc(i64) + +; Make sure we can eliminate `%tmp17 = icmp ult i32 %tmp10, 3`. 
+ +declare void @use(i1) + +define internal i32* @spam(i32* %arg) { +; CHECK-LABEL: @spam( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = call i8* @malloc(i64 10368) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP]] to i32* +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[ARG:%.*]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[ARG]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i32* [[TMP7]], null +; CHECK-NEXT: br i1 [[TMP10]], label [[BB17:%.*]], label [[BB13:%.*]] +; CHECK: bb13: +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[ARG]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 1 +; CHECK-NEXT: br label [[BB30:%.*]] +; CHECK: bb17: +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[ARG]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 8 +; CHECK-NEXT: br i1 [[TMP18]], label [[BB30]], label [[BB13]] +; CHECK: bb30: +; CHECK-NEXT: ret i32* [[TMP1]] +; +bb: + %tmp = call i8* @malloc(i64 10368) + %tmp1 = bitcast i8* %tmp to i32* + %tmp4 = getelementptr inbounds i32, i32* %arg, i32 0 + %tmp5 = load i32, i32* %tmp4, align 8 + %tmp6 = add i32 %tmp5, 1 + %tmp7 = getelementptr inbounds i32, i32* %arg, i32 1 + %tmp10 = icmp ne i32* %tmp7, null + br i1 %tmp10, label %bb17, label %bb13 + +bb13: + %tmp14 = getelementptr inbounds i32, i32* %arg, i32 2 + %tmp15 = load i32, i32* %tmp14, align 8 + %tmp16 = add i32 %tmp15, 1 + br label %bb30 + +bb17: + %tmp18 = icmp eq i32 %tmp6, %tmp5 + %tmp19 = getelementptr inbounds i32, i32* %arg, i32 3 + %tmp20 = load i32, i32* %tmp19, align 8 + br i1 %tmp18, label %bb30, label %bb13 + +bb30: + ret i32* %tmp1 +} + +define void @spam.1(i32* %arg) { +bb: + %tmp = alloca i8*, align 8 + %tmp4 = call i32* @spam(i32* %arg) + br label %bb6 + +bb6: ; preds = %bb5 + %tmp7 = getelementptr inbounds i32, i32* %tmp4, i32 1 + %tmp10 = load i32, i32* %tmp7, align 8 + %tmp11 = icmp ne i32 %tmp10, 0 + br i1 %tmp11, label %bb6, label %bb15 + +bb15: ; preds = %bb12 + %tmp17 = icmp ult i32 %tmp10, 3 + call void @use(i1 %tmp17) + br i1 %tmp17, label %bb6, label %bb24 + +bb24: + ret void +} diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll @@ -0,0 +1,287 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -ipsccp -S | FileCheck %s + +declare void @use(i1) + +define void @f1(i32 %a, i32 %b) { +; CHECK-LABEL: @f1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_255:%.*]] = and i32 [[A:%.*]], 255 +; CHECK-NEXT: [[A_2:%.*]] = add i32 [[A_255]], 20 +; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i32 [[B:%.*]], [[A_2]] +; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: [[C_2:%.*]] = icmp eq i32 [[B]], 0 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp eq i32 [[B]], 20 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp eq i32 [[B]], 21 +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: ret void +; CHECK: false: +; CHECK-NEXT: ret void +; +entry: + %a.255 = and i32 %a, 255 + %a.2 = add i32 %a.255, 20 + %c.1 = icmp ugt i32 %b, %a.2 + br i1 %c.1, label %true, label %false + 
+true: + %c.2 = icmp eq i32 %b, 0 + call void @use(i1 %c.2) + %c.3 = icmp eq i32 %b, 20 + call void @use(i1 %c.3) + %c.4 = icmp eq i32 %b, 21 + call void @use(i1 %c.4) + ret void + +false: + ret void +} + + +define void @f2(i8* %a) { +; CHECK-LABEL: @f2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8* [[A:%.*]], null +; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; CHECK: false: +; CHECK-NEXT: ret void +; +entry: + %c.1 = icmp eq i8* %a, null + br i1 %c.1, label %true, label %false + +true: + %c.2 = icmp eq i8*%a, null + call void @use(i1 %c.2) + ret void + +false: + ret void +} + +define i8* @f3(i8* %a, i8* %b, i1 %c) { +; CHECK-LABEL: @f3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8* [[A:%.*]], null +; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: br i1 [[C:%.*]], label [[TRUE_2:%.*]], label [[FALSE_2:%.*]] +; CHECK: true.2: +; CHECK-NEXT: br label [[EXIT_2:%.*]] +; CHECK: false.2: +; CHECK-NEXT: br label [[EXIT_2]] +; CHECK: exit.2: +; CHECK-NEXT: [[P:%.*]] = phi i8* [ null, [[TRUE_2]] ], [ [[B:%.*]], [[FALSE_2]] ] +; CHECK-NEXT: ret i8* [[P]] +; CHECK: false: +; CHECK-NEXT: ret i8* null +; +entry: + %c.1 = icmp eq i8* %a, null + br i1 %c.1, label %true, label %false + +true: + br i1 %c, label %true.2, label %false.2 + +true.2: + br label %exit.2 + +false.2: + br label %exit.2 + +exit.2: + %p = phi i8* [ %a, %true.2 ], [ %b, %false.2 ] + ret i8* %p + +false: + ret i8* null +} + + +define i32 @f5(i64 %sz) { +; CHECK-LABEL: @f5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 4088, [[SZ:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 4088, [[SZ]] +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_TRUE]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[COND]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %cmp = icmp ugt i64 4088, %sz + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %div = udiv i64 4088, %sz + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %div, %cond.true ], [ 1, %entry ] + %conv = trunc i64 %cond to i32 + ret i32 %conv +} + +define void @f6(i32 %b) { +; CHECK-LABEL: @f6( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i32 [[B:%.*]], 20 +; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; CHECK: false: +; CHECK-NEXT: ret void +; +entry: + %a = add i32 10, 10 + %c.1 = icmp ugt i32 %b, %a + br i1 %c.1, label %true, label %false + +true: + %c.2 = icmp eq i32 %a, 20 + call void @use(i1 %c.2) + ret void + +false: + ret void +} + + +define void @loop.1() { +entry: + br label %for.cond + +for.cond: ; preds = %for.cond.cleanup13, %if.then + %i.0 = phi i32 [ 0, %entry ], [ %inc27, %for.cond.cleanup13 ] + %cmp9 = icmp sle i32 %i.0, 3 + br i1 %cmp9, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret void + +for.body: ; preds = %for.cond + br label %for.cond11 + +for.cond11: ; preds = %arrayctor.cont21, %for.body + br label %for.cond.cleanup13 + +for.cond.cleanup13: ; preds = %for.cond11 + %inc27 = add nsw i32 %i.0, 1 + br label %for.cond +} + + +define void @loop() { +; CHECK-LABEL: @loop( +; 
CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC27:%.*]], [[FOR_COND_CLEANUP13:%.*]] ] +; CHECK-NEXT: [[CMP9:%.*]] = icmp sle i32 [[I_0]], 3 +; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br label [[FOR_COND11:%.*]] +; CHECK: for.cond11: +; CHECK-NEXT: [[J_0:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY14:%.*]] ] +; CHECK-NEXT: [[CMP12:%.*]] = icmp slt i32 [[J_0]], 2 +; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY14]], label [[FOR_COND_CLEANUP13]] +; CHECK: for.cond.cleanup13: +; CHECK-NEXT: [[INC27]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.body14: +; CHECK-NEXT: [[INC]] = add nsw i32 [[J_0]], 1 +; CHECK-NEXT: br label [[FOR_COND11]] +; +entry: + br label %for.cond + +for.cond: ; preds = %for.cond.cleanup13, %if.then + %i.0 = phi i32 [ 0, %entry ], [ %inc27, %for.cond.cleanup13 ] + %cmp9 = icmp sle i32 %i.0, 3 + br i1 %cmp9, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret void + +for.body: ; preds = %for.cond + br label %for.cond11 + +for.cond11: ; preds = %arrayctor.cont21, %for.body + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.body14 ] + %cmp12 = icmp slt i32 %j.0, 2 + br i1 %cmp12, label %for.body14, label %for.cond.cleanup13 + +for.cond.cleanup13: ; preds = %for.cond11 + %inc27 = add nsw i32 %i.0, 1 + br label %for.cond + +for.body14: + %inc = add nsw i32 %j.0, 1 + br label %for.cond11 +} + +define i32 @foo(i64 %sz) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 4088, [[SZ:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 4088, [[SZ]] +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_TRUE]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[COND]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %cmp = icmp ugt i64 4088, %sz + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %div = udiv i64 4088, %sz + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %div, %cond.true ], [ 1, %entry ] + %conv = trunc i64 %cond to i32 + ret i32 %conv +} + +define i32 @bar(i64 %sz) { +; CHECK-LABEL: @bar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[SZ:%.*]], 4088 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 4088, [[SZ]] +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_TRUE]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[COND]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %cmp = icmp ugt i64 %sz, 4088 + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %div = udiv i64 4088, %sz + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %div, %cond.true ], [ 1, %entry ] + %conv = trunc i64 %cond to i32 + ret i32 %conv +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -69,31 +69,51 @@ ; } define i32 @test_mul(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_mul( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 -; CHECK-NEXT: [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]] -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 -; CHECK-NEXT: [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]] -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 -; CHECK-NEXT: [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]] -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 -; CHECK-NEXT: [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]] -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 -; CHECK-NEXT: [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]] -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 -; CHECK-NEXT: [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]] -; CHECK-NEXT: ret i32 [[MUL_714]] +; AVX-LABEL: @test_mul( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = mul <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = mul <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = mul <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_mul( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; SSE-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]] +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[TMP2:%.*]] = load 
i32, i32* [[ARRAYIDX_2]], align 4 +; SSE-NEXT: [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]] +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; SSE-NEXT: [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]] +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; SSE-NEXT: [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]] +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 +; SSE-NEXT: [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]] +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 +; SSE-NEXT: [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]] +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 +; SSE-NEXT: [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]] +; SSE-NEXT: ret i32 [[MUL_714]] ; entry: %0 = load i32, i32* %p, align 4 diff --git a/llvm/test/Verifier/bswap.ll b/llvm/test/Verifier/bswap.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Verifier/bswap.ll @@ -0,0 +1,53 @@ +; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s +; Check handling of bswap with unsupported sizes. + +declare i8 @llvm.bswap.i8(i8) +declare <2 x i8> @llvm.bswap.v2i8(<2 x i8>) + +declare i12 @llvm.bswap.i12(i12) +declare <2 x i12> @llvm.bswap.v2i12(<2 x i12>) + +declare i18 @llvm.bswap.i18(i18) +declare <2 x i18> @llvm.bswap.v2i18(<2 x i18>) + +define i8 @bswap_i8(i8 %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call i8 @llvm.bswap.i8(i8 %arg) + %res = call i8 @llvm.bswap.i8(i8 %arg) + ret i8 %res +} + +define <2 x i8> @bswap_v2i8(<2 x i8> %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call <2 x i8> @llvm.bswap.v2i8(<2 x i8> %arg) + %res = call <2 x i8> @llvm.bswap.v2i8(<2 x i8> %arg) + ret <2 x i8> %res +} + +define i12 @bswap_i12(i12 %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call i12 @llvm.bswap.i12(i12 %arg) + %res = call i12 @llvm.bswap.i12(i12 %arg) + ret i12 %res +} + +define <2 x i12> @bswap_v2i12(<2 x i12> %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call <2 x i12> @llvm.bswap.v2i12(<2 x i12> %arg) + %res = call <2 x i12> @llvm.bswap.v2i12(<2 x i12> %arg) + ret <2 x i12> %res +} + +define i18 @bswap_i18(i18 %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call i18 @llvm.bswap.i18(i18 %arg) + %res = call i18 @llvm.bswap.i18(i18 %arg) + ret i18 %res +} + +define <2 x i18> @bswap_v2i18(<2 x i18> %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call <2 x i18> @llvm.bswap.v2i18(<2 x i18> %arg) + %res = call <2 x i18> @llvm.bswap.v2i18(<2 x i18> %arg) + ret <2 x i18> %res +} diff --git a/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s b/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s --- a/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s +++ b/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s @@ -1,43 +1,35 @@ -# RUN: llvm-mc -triple=powerpc64le-unknown-linux -filetype=obj %s -o %t.o -# RUN: llvm-objdump -d %t.o | FileCheck %s - -# RUN: llvm-mc -triple=powerpc64-unknown-linux -filetype=obj %s -o %t.o -# RUN: llvm-objdump -d 
%t.o | FileCheck %s - -# RUN: llvm-mc -triple=powerpc-unknown-linux -filetype=obj %s -o %t.o -# RUN: llvm-objdump -d %t.o | FileCheck %s - -# CHECK: {{0*}}00000000 : -# CHECK: 18: {{.*}} bl .-24 -# CHECK: 20: {{.*}} bl .+16 -# CHECK: {{0*}}00000030 : - - .text - .global caller - .type caller,@function - .type callee_forward,@function - .type callee_back,@function - - .p2align 4 -callee_back: - li 3, 55 - blr - - .p2align 4 -caller: -.Lgep: - addis 2, 12, .TOC.-.Lgep@ha - addi 2, 2, .TOC.-.Lgep@l -.Llep: - .localentry caller, .Llep-.Lgep - bl callee_back - mr 31, 3 - bl callee_forward - add 3, 3, 31 - blr - - .p2align 4 -callee_forward: - li 3, 66 - blr +# RUN: llvm-mc -triple=powerpc -filetype=obj %s -o %t.32.o +# RUN: llvm-objdump -d --no-show-raw-insn %t.32.o | FileCheck --check-prefixes=ELF32,CHECK %s +# RUN: llvm-mc -triple=powerpc64le -filetype=obj %s -o %t.64.o +# RUN: llvm-objdump -d --no-show-raw-insn %t.64.o | FileCheck --check-prefixes=ELF64,CHECK %s + +# RUN: llvm-mc -triple=powerpc64 -filetype=obj %s -o %t.64.o +# RUN: llvm-objdump -d --no-show-raw-insn %t.64.o | FileCheck --check-prefixes=ELF64,CHECK %s + +# CHECK-LABEL: : +# ELF32-NEXT: bl .-4 +# ELF64-NEXT: bl .-4 +# CHECK-NEXT: bl .+0 +# CHECK-NEXT: bl .+4 + +bl: + bl .-4 + bl . + bl .+4 + +# CHECK-LABEL: : +# CHECK-NEXT: b .+67108860 +# CHECK-NEXT: b .+0 +# CHECK-NEXT: b .+4 + +b: + b .-4 + b . + b .+4 + +# CHECK-LABEL: : +# CHECK-NEXT: bt 2, .+65532 + +bt: + bt 2, .-4 diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -2516,4 +2516,53 @@ EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } +// Test widening of G_UNMERGE_VALUES +TEST_F(GISelMITest, WidenUnmerge) { + setUp(); + if (!TM) + return; + + DefineLegalizerInfo(A, {}); + + // Check that widening G_UNMERGE_VALUES to a larger type than the source type + // works as expected + LLT P0{LLT::pointer(0, 64)}; + LLT S32{LLT::scalar(32)}; + LLT S96{LLT::scalar(96)}; + + auto IntToPtr = B.buildIntToPtr(P0, Copies[0]); + auto UnmergePtr = B.buildUnmerge(S32, IntToPtr); + auto UnmergeScalar = B.buildUnmerge(S32, Copies[0]); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + + // Perform Legalization + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.widenScalar(*UnmergePtr, 0, S96)); + + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.widenScalar(*UnmergeScalar, 0, S96)); + + const auto *CheckStr = R"( + CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY + CHECK: [[PTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[COPY]] + CHECK: [[INT:%[0-9]+]]:_(s64) = G_PTRTOINT [[PTR]] + CHECK: [[ANYEXT:%[0-9]+]]:_(s96) = G_ANYEXT [[INT]] + CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]] + CHECK: [[C:%[0-9]+]]:_(s96) = G_CONSTANT i96 32 + CHECK: [[LSHR:%[0-9]+]]:_(s96) = G_LSHR [[ANYEXT]]:_, [[C]] + CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]] + CHECK: [[ANYEXT:%[0-9]+]]:_(s96) = G_ANYEXT [[COPY]] + CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]] + CHECK: [[C:%[0-9]+]]:_(s96) = G_CONSTANT i96 32 + CHECK: [[LSHR:%[0-9]+]]:_(s96) = G_LSHR [[ANYEXT]]:_, [[C]] + CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]] + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + } // namespace diff --git a/mlir/docs/LangRef.md b/mlir/docs/LangRef.md --- a/mlir/docs/LangRef.md +++ 
b/mlir/docs/LangRef.md @@ -944,7 +944,7 @@ following figure shows an index map which maps a 2-dimensional index from a 2x2 index space to a 3x3 index space, using symbols `S0` and `S1` as offsets. -![Index Map Example](includes/img/index-map.svg) +![Index Map Example](/includes/img/index-map.svg) The number of domain dimensions and range dimensions of an index map can be different, but must match the number of dimensions of the input and output index diff --git a/mlir/include/mlir/Analysis/Passes.h b/mlir/include/mlir/Analysis/Passes.h deleted file mode 100644 --- a/mlir/include/mlir/Analysis/Passes.h +++ /dev/null @@ -1,33 +0,0 @@ -//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This header file defines prototypes that expose pass constructors in the -// analysis library. -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_ANALYSIS_PASSES_H -#define MLIR_ANALYSIS_PASSES_H - -#include "mlir/Support/LLVM.h" -#include - -namespace mlir { - -class FuncOp; -template class OpPassBase; - -/// Creates a pass to check memref accesses in a Function. -std::unique_ptr> createTestMemRefBoundCheckPass(); - -/// Creates a pass to check memref access dependences in a Function. -std::unique_ptr> createTestMemRefDependenceCheckPass(); - -} // end namespace mlir - -#endif // MLIR_ANALYSIS_PASSES_H diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h --- a/mlir/include/mlir/Dialect/Affine/Passes.h +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -32,7 +32,7 @@ std::unique_ptr> createSimplifyAffineStructuresPass(); /// Creates a loop invariant code motion pass that hoists loop invariant -/// instructions out of affine loop. +/// operations out of affine loops. std::unique_ptr> createAffineLoopInvariantCodeMotionPass(); /// Performs packing (or explicit copying) of accessed memref regions into @@ -43,6 +43,31 @@ unsigned tagMemorySpace = 0, int minDmaTransferSize = 1024, uint64_t fastMemCapacityBytes = std::numeric_limits::max()); +/// Creates a pass to perform tiling on loop nests. +std::unique_ptr> +createLoopTilingPass(uint64_t cacheSizeBytes); + +/// Creates a loop unrolling pass with the provided parameters. +/// 'getUnrollFactor' is a function callback for clients to supply a function +/// that computes an unroll factor - the callback takes precedence over unroll +/// factors supplied through other means. If -1 is passed as the unrollFactor +/// and no callback is provided, anything passed from the command-line (if at +/// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor). +std::unique_ptr> createLoopUnrollPass( + int unrollFactor = -1, int unrollFull = -1, + const std::function &getUnrollFactor = nullptr); + +/// Creates a loop unroll jam pass to unroll jam by the specified factor. A +/// factor of -1 lets the pass use the default factor or the one on the command +/// line if provided. +std::unique_ptr> +createLoopUnrollAndJamPass(int unrollJamFactor = -1); + +/// Creates a pass to vectorize loops, operations and data types using a +/// target-independent, n-D super-vector abstraction. 
+std::unique_ptr> +createSuperVectorizePass(ArrayRef virtualVectorSize); + } // end namespace mlir #endif // MLIR_DIALECT_AFFINE_RANSFORMS_PASSES_H diff --git a/mlir/include/mlir/IR/StandardTypes.h b/mlir/include/mlir/IR/StandardTypes.h --- a/mlir/include/mlir/IR/StandardTypes.h +++ b/mlir/include/mlir/IR/StandardTypes.h @@ -416,8 +416,8 @@ /// MemRef types represent a region of memory that have a shape with a fixed /// number of dimensions. Each shape element can be a non-negative integer or -/// unknown (represented by any negative integer). MemRef types also have an -/// affine map composition, represented as an array AffineMap pointers. +/// unknown (represented by -1). MemRef types also have an affine map +/// composition, represented as an array AffineMap pointers. class MemRefType : public Type::TypeBase { public: diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -14,7 +14,6 @@ #ifndef MLIR_INITALLPASSES_H_ #define MLIR_INITALLPASSES_H_ -#include "mlir/Analysis/Passes.h" #include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h" #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" @@ -62,7 +61,7 @@ // Init general passes createCanonicalizerPass(); createCSEPass(); - createVectorizePass({}); + createSuperVectorizePass({}); createLoopUnrollPass(); createLoopUnrollAndJamPass(); createSimplifyAffineStructuresPass(); @@ -98,10 +97,6 @@ // CUDA createConvertGpuLaunchFuncToCudaCallsPass(); -#if MLIR_CUDA_CONVERSIONS_ENABLED - createConvertGPUKernelToCubinPass( - [](const std::string &, Location, StringRef) { return nullptr; }); -#endif createLowerGpuOpsToNVVMOpsPass(); // Linalg diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -32,27 +32,6 @@ /// Creates a pass to perform common sub expression elimination. std::unique_ptr createCSEPass(); -/// Creates a pass to vectorize loops, operations and data types using a -/// target-independent, n-D super-vector abstraction. -std::unique_ptr> -createVectorizePass(ArrayRef virtualVectorSize); - -/// Creates a loop unrolling pass with the provided parameters. -/// 'getUnrollFactor' is a function callback for clients to supply a function -/// that computes an unroll factor - the callback takes precedence over unroll -/// factors supplied through other means. If -1 is passed as the unrollFactor -/// and no callback is provided, anything passed from the command-line (if at -/// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor). -std::unique_ptr> createLoopUnrollPass( - int unrollFactor = -1, int unrollFull = -1, - const std::function &getUnrollFactor = nullptr); - -/// Creates a loop unroll jam pass to unroll jam by the specified factor. A -/// factor of -1 lets the pass use the default factor or the one on the command -/// line if provided. -std::unique_ptr> -createLoopUnrollAndJamPass(int unrollJamFactor = -1); - /// Creates a loop fusion pass which fuses loops. Buffers of size less than or /// equal to `localBufSizeThreshold` are promoted to memory space /// `fastMemorySpace'. @@ -65,10 +44,6 @@ /// instructions out of the loop. std::unique_ptr createLoopInvariantCodeMotionPass(); -/// Creates a loop invariant code motion pass that hoists loop invariant -/// instructions out of affine loop. 
-std::unique_ptr> createAffineLoopInvariantCodeMotionPass(); - /// Creates a pass to pipeline explicit movement of data across levels of the /// memory hierarchy. std::unique_ptr> createPipelineDataTransferPass(); @@ -78,10 +53,6 @@ /// primitives). std::unique_ptr> createLowerAffinePass(); -/// Creates a pass to perform tiling on loop nests. -std::unique_ptr> -createLoopTilingPass(uint64_t cacheSizeBytes); - /// Creates a pass that transforms perfectly nested loops with independent /// bounds into a single loop. std::unique_ptr> createLoopCoalescingPass(); diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp --- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp @@ -49,8 +49,7 @@ class GpuKernelToCubinPass : public OperationPass { public: - GpuKernelToCubinPass( - CubinGenerator cubinGenerator = compilePtxToCubinForTesting) + GpuKernelToCubinPass(CubinGenerator cubinGenerator) : cubinGenerator(cubinGenerator) {} void runOnOperation() override { @@ -76,9 +75,6 @@ } private: - static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx, - Location, StringRef); - std::string translateModuleToPtx(llvm::Module &module, llvm::TargetMachine &target_machine); @@ -112,13 +108,6 @@ return ptx; } -OwnedCubin -GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx, - Location, StringRef) { - const char data[] = "CUBIN"; - return std::make_unique>(data, data + sizeof(data) - 1); -} - OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule, Location loc, StringRef name) { @@ -158,7 +147,3 @@ mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) { return std::make_unique(cubinGenerator); } - -static PassRegistration - pass("test-kernel-to-cubin", - "Convert all kernel functions to CUDA cubin blobs"); diff --git a/mlir/lib/Conversion/LoopToStandard/CMakeLists.txt b/mlir/lib/Conversion/LoopToStandard/CMakeLists.txt --- a/mlir/lib/Conversion/LoopToStandard/CMakeLists.txt +++ b/mlir/lib/Conversion/LoopToStandard/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_conversion_library(MLIRLoopToStandard - ConvertLoopToStandard.cpp + LoopToStandard.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/LoopToStandard diff --git a/mlir/lib/Conversion/LoopToStandard/ConvertLoopToStandard.cpp b/mlir/lib/Conversion/LoopToStandard/LoopToStandard.cpp rename from mlir/lib/Conversion/LoopToStandard/ConvertLoopToStandard.cpp rename to mlir/lib/Conversion/LoopToStandard/LoopToStandard.cpp --- a/mlir/lib/Conversion/LoopToStandard/ConvertLoopToStandard.cpp +++ b/mlir/lib/Conversion/LoopToStandard/LoopToStandard.cpp @@ -1,4 +1,4 @@ -//===- ConvertLoopToStandard.cpp - ControlFlow to CFG conversion ----------===// +//===- LoopToStandard.cpp - ControlFlow to CFG conversion -----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
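With the affine loop pass entry points above relocated into mlir/Dialect/Affine/Passes.h (and createVectorizePass renamed to createSuperVectorizePass), clients build the same pipeline through the new header. The sketch below is illustrative only: buildAffinePipeline is a made-up helper, and the PassManager::nest<FuncOp>() plumbing plus the unsigned(AffineForOp) element type of the unroll-factor callback are assumptions about the surrounding API, not part of this patch.

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/Function.h"
#include "mlir/Pass/PassManager.h"

// Assemble an affine pipeline from the entry points declared in
// Dialect/Affine/Passes.h above (sketch under the assumptions stated above).
void buildAffinePipeline(mlir::PassManager &pm) {
  mlir::OpPassManager &fpm = pm.nest<mlir::FuncOp>();
  // The callback takes precedence over the -1 defaults, per the
  // createLoopUnrollPass documentation above.
  fpm.addPass(mlir::createLoopUnrollPass(
      /*unrollFactor=*/-1, /*unrollFull=*/-1,
      [](mlir::AffineForOp) -> unsigned { return 4; }));
  fpm.addPass(mlir::createLoopUnrollAndJamPass(/*unrollJamFactor=*/2));
  fpm.addPass(mlir::createLoopTilingPass(/*cacheSizeBytes=*/512 * 1024));
  fpm.addPass(mlir::createSuperVectorizePass(/*virtualVectorSize=*/{256}));
}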
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp @@ -24,36 +24,17 @@ using namespace mlir; using namespace mlir::loop; -static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options"); -static llvm::cl::opt - clNumBlockDims("gpu-block-dims", - llvm::cl::desc("Number of GPU block dimensions for mapping"), - llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u)); -static llvm::cl::opt clNumThreadDims( - "gpu-thread-dims", - llvm::cl::desc("Number of GPU thread dimensions for mapping"), - llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u)); - -static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME - " options"); -static llvm::cl::list - clNumWorkGroups("gpu-num-workgroups", - llvm::cl::desc("Num workgroups in the GPU launch"), - llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated, - llvm::cl::cat(clLoopOpToGPUCategory)); -static llvm::cl::list - clWorkGroupSize("gpu-workgroup-size", - llvm::cl::desc("Workgroup Size in the GPU launch"), - llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated, - llvm::cl::cat(clLoopOpToGPUCategory)); - namespace { // A pass that traverses top-level loops in the function and converts them to // GPU launch operations. Nested launches are not allowed, so this does not // walk the function recursively to avoid considering nested loops. struct ForLoopMapper : public FunctionPass { - ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims) - : numBlockDims(numBlockDims), numThreadDims(numThreadDims) {} + ForLoopMapper() = default; + ForLoopMapper(const ForLoopMapper &) {} + ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims) { + this->numBlockDims = numBlockDims; + this->numThreadDims = numThreadDims; + } void runOnFunction() override { for (Block &block : getFunction()) @@ -70,8 +51,14 @@ } } - unsigned numBlockDims; - unsigned numThreadDims; + Option numBlockDims{ + *this, "gpu-block-dims", + llvm::cl::desc("Number of GPU block dimensions for mapping"), + llvm::cl::init(1u)}; + Option numThreadDims{ + *this, "gpu-thread-dims", + llvm::cl::desc("Number of GPU thread dimensions for mapping"), + llvm::cl::init(1u)}; }; // A pass that traverses top-level loops in the function and convertes them to @@ -81,10 +68,13 @@ // to be perfectly nested upto depth equal to size of `workGroupSize`. struct ImperfectlyNestedForLoopMapper : public FunctionPass { + ImperfectlyNestedForLoopMapper() = default; + ImperfectlyNestedForLoopMapper(const ImperfectlyNestedForLoopMapper &) {} ImperfectlyNestedForLoopMapper(ArrayRef numWorkGroups, - ArrayRef workGroupSize) - : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()), - workGroupSize(workGroupSize.begin(), workGroupSize.end()) {} + ArrayRef workGroupSize) { + this->numWorkGroups->assign(numWorkGroups.begin(), numWorkGroups.end()); + this->workGroupSize->assign(workGroupSize.begin(), workGroupSize.end()); + } void runOnFunction() override { // Insert the num work groups and workgroup sizes as constant values. 
This @@ -113,8 +103,14 @@ } } } - SmallVector numWorkGroups; - SmallVector workGroupSize; + ListOption numWorkGroups{ + *this, "gpu-num-workgroups", + llvm::cl::desc("Num workgroups in the GPU launch"), llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated}; + ListOption workGroupSize{ + *this, "gpu-workgroup-size", + llvm::cl::desc("Workgroup Size in the GPU launch"), llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated}; }; struct ParallelLoopToGpuPass : public OperationPass { @@ -152,20 +148,11 @@ } static PassRegistration - registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] { - return std::make_unique(clNumBlockDims.getValue(), - clNumThreadDims.getValue()); - }); - -static PassRegistration loopOpToGPU( - LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels", - [] { - SmallVector numWorkGroups, workGroupSize; - numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end()); - workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end()); - return std::make_unique(numWorkGroups, - workGroupSize); - }); + registration(PASS_NAME, "Convert top-level loops to GPU kernels"); + +static PassRegistration + loopOpToGPU(LOOPOP_TO_GPU_PASS_NAME, + "Convert top-level loop::ForOp to GPU kernels"); static PassRegistration pass("convert-parallel-loops-to-gpu", "Convert mapped loop.parallel ops" diff --git a/mlir/lib/Conversion/StandardToLLVM/CMakeLists.txt b/mlir/lib/Conversion/StandardToLLVM/CMakeLists.txt --- a/mlir/lib/Conversion/StandardToLLVM/CMakeLists.txt +++ b/mlir/lib/Conversion/StandardToLLVM/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_conversion_library(MLIRStandardToLLVM - ConvertStandardToLLVM.cpp + StandardToLLVM.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/StandardToLLVM diff --git a/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp rename from mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp rename to mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp --- a/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -1,4 +1,4 @@ -//===- ConvertStandardToLLVM.cpp - Standard to LLVM dialect conversion-----===// +//===- StandardToLLVM.cpp - Standard to LLVM dialect conversion -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
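The LoopsToGPU changes above show the migration pattern applied throughout this patch: file-static llvm::cl globals become per-instance Option/ListOption members, the pass gains a default constructor plus a copy constructor (pass options are tied to the instance and are not copyable, so the copy constructor deliberately re-initializes them), and PassRegistration drops its factory lambda because the registry can now default-construct the pass and parse its options. A minimal sketch of the pattern; MyPass, my-pass, and num-things are made-up names, not part of this patch.

#include "mlir/Pass/Pass.h"

namespace {
struct MyPass : public mlir::FunctionPass<MyPass> {
  MyPass() = default;
  // Options are bound to the pass instance and are not copyable, so the copy
  // constructor re-initializes them instead of copying.
  MyPass(const MyPass &) {}

  void runOnFunction() override {
    // numThings is populated by the pass registry, e.g. from
    // -my-pass="num-things=4" on the tool's command line, in the same style
    // as the options added in this patch.
    unsigned n = numThings;
    (void)n;
  }

  Option<unsigned> numThings{
      *this, "num-things",
      llvm::cl::desc("How many things to process per function"),
      llvm::cl::init(1u)};
};
} // namespace

static mlir::PassRegistration<MyPass>
    pass("my-pass", "Sketch of an option-carrying pass");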
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp --- a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp @@ -16,12 +16,12 @@ #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" -#include "mlir/Transforms/Passes.h" #include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt @@ -1,6 +1,10 @@ add_mlir_dialect_library(MLIRAffineTransforms AffineDataCopyGeneration.cpp AffineLoopInvariantCodeMotion.cpp + LoopTiling.cpp + LoopUnroll.cpp + LoopUnrollAndJam.cpp + SuperVectorize.cpp SimplifyAffineStructures.cpp ADDITIONAL_HEADER_DIRS diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp rename from mlir/lib/Transforms/LoopTiling.cpp rename to mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp --- a/mlir/lib/Transforms/LoopTiling.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp @@ -15,10 +15,10 @@ #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" -#include "mlir/Transforms/Passes.h" #include "mlir/Transforms/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -29,19 +29,20 @@ static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); static llvm::cl::opt - clCacheSizeKiB("tile-cache-size", + clCacheSizeKiB("affine-tile-cache-size", llvm::cl::desc("Set size of cache to tile for in KiB"), llvm::cl::cat(clOptionsCategory)); // Tile size to use for all loops (overrides -tile-sizes if provided). static llvm::cl::opt - clTileSize("tile-size", llvm::cl::desc("Use this tile size for all loops"), + clTileSize("affine-tile-size", + llvm::cl::desc("Use this tile size for all loops"), llvm::cl::cat(clOptionsCategory)); // List of tile sizes. If any of them aren't provided, they are filled with // clTileSize / kDefaultTileSize. 
static llvm::cl::list clTileSizes( - "tile-sizes", + "affine-tile-sizes", llvm::cl::desc( "List of tile sizes for each perfect nest (overridden by -tile-size)"), llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); diff --git a/mlir/lib/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp rename from mlir/lib/Transforms/LoopUnroll.cpp rename to mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp --- a/mlir/lib/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -10,10 +10,9 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Transforms/Passes.h" - #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" diff --git a/mlir/lib/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp rename from mlir/lib/Transforms/LoopUnrollAndJam.cpp rename to mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp --- a/mlir/lib/Transforms/LoopUnrollAndJam.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp @@ -32,10 +32,9 @@ // Note: 'if/else' blocks are not jammed. So, if there are loops inside if // op's, bodies of those loops will not be jammed. //===----------------------------------------------------------------------===// -#include "mlir/Transforms/Passes.h" - #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BlockAndValueMapping.h" diff --git a/mlir/lib/Transforms/Vectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp rename from mlir/lib/Transforms/Vectorize.cpp rename to mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp --- a/mlir/lib/Transforms/Vectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -1,4 +1,4 @@ -//===- Vectorize.cpp - Vectorize Pass Impl --------------------------------===// +//===- SuperVectorize.cpp - Vectorize Pass Impl ---------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -16,6 +16,7 @@
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/Dialect/Vector/VectorUtils.h"
@@ -27,7 +28,6 @@
 #include "mlir/Support/Functional.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/FoldUtils.h"
-#include "mlir/Transforms/Passes.h"

 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
@@ -414,7 +414,7 @@
 ///
 /// The -affine-vectorize pass with the following arguments:
 /// ```
-/// -affine-vectorize -virtual-vector-size 256 --test-fastest-varying=0
+/// -affine-vectorize="virtual-vector-size=256 test-fastest-varying=0"
 /// ```
 ///
 /// produces this standard innermost-loop vectorized code:
@@ -468,8 +468,7 @@
 ///
 /// The -affine-vectorize pass with the following arguments:
 /// ```
-/// -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256
-/// --test-fastest-varying=1 --test-fastest-varying=0
+/// -affine-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0"
 /// ```
 ///
 /// produces this more interesting mixed outer-innermost-loop vectorized code:
@@ -531,21 +530,6 @@
 using llvm::dbgs;
 using llvm::SetVector;

-static llvm::cl::OptionCategory clOptionsCategory("vectorize options");
-
-static llvm::cl::list clVirtualVectorSize(
-    "virtual-vector-size",
-    llvm::cl::desc("Specify an n-D virtual vector size for vectorization"),
-    llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory));
-
-static llvm::cl::list clFastestVaryingPattern(
-    "test-fastest-varying",
-    llvm::cl::desc(
-        "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory"
-        " dimensions to match. See defaultPatterns in Vectorize.cpp for a"
-        " description and examples. This is used for testing purposes"),
-    llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory));
-
 /// Forward declaration.
 static FilterFunctionType
 isVectorizableLoopPtrFactory(const DenseSet &parallelLoops,
@@ -590,33 +574,35 @@
 /// Base state for the vectorize pass.
 /// Command line arguments are preempted by non-empty pass arguments.
 struct Vectorize : public FunctionPass<Vectorize> {
-  Vectorize();
+  Vectorize() = default;
+  Vectorize(const Vectorize &) {}
   Vectorize(ArrayRef<int64_t> virtualVectorSize);
   void runOnFunction() override;

-  // The virtual vector size that we vectorize to.
-  SmallVector vectorSizes;
-  // Optionally, the fixed mapping from loop to fastest varying MemRef dimension
-  // for all the MemRefs within a loop pattern:
-  // the index represents the loop depth, the value represents the k^th
-  // fastest varying memory dimension.
-  // This is voluntarily restrictive and is meant to precisely target a
-  // particular loop/op pair, for testing purposes.
+  /// The virtual vector size that we vectorize to.
+  ListOption<int64_t> vectorSizes{
+      *this, "virtual-vector-size",
+      llvm::cl::desc("Specify an n-D virtual vector size for vectorization"),
+      llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated};
+  /// Optionally, the fixed mapping from loop to fastest varying MemRef
+  /// dimension for all the MemRefs within a loop pattern:
+  /// the index represents the loop depth, the value represents the k^th
+  /// fastest varying memory dimension.
+  /// This is voluntarily restrictive and is meant to precisely target a
+  /// particular loop/op pair, for testing purposes.
+  ListOption<int64_t> fastestVaryingPattern{
+      *this, "test-fastest-varying",
+      llvm::cl::desc(
+          "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory"
+          " dimensions to match. See defaultPatterns in Vectorize.cpp for a"
+          " description and examples. This is used for testing purposes"),
+      llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated};
 };

 } // end anonymous namespace

-Vectorize::Vectorize()
-    : vectorSizes(clVirtualVectorSize.begin(), clVirtualVectorSize.end()),
-      fastestVaryingPattern(clFastestVaryingPattern.begin(),
-                            clFastestVaryingPattern.end()) {}
-
-Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) : Vectorize() {
-  if (!virtualVectorSize.empty()) {
-    this->vectorSizes.assign(virtualVectorSize.begin(),
-                             virtualVectorSize.end());
-  }
+Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) {
+  vectorSizes->assign(virtualVectorSize.begin(), virtualVectorSize.end());
 }

 /////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate.
@@ -1282,10 +1268,10 @@
 }

 std::unique_ptr>
-mlir::createVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
+mlir::createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
   return std::make_unique<Vectorize>(virtualVectorSize);
 }

 static PassRegistration<Vectorize>
-    pass("affine-vectorize",
+    pass("affine-super-vectorize",
         "Vectorize to a target independent n-D vector abstraction");
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
@@ -55,14 +55,6 @@
 /// More advanced use cases, analyses as well as profitability heuristics are
 /// left for future work.

-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-static llvm::cl::list clTileSizes(
-    "linalg-fusion-tile-sizes",
-    llvm::cl::desc(
-        "Tile sizes by which to tile linalg operations during linalg fusion"),
-    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
-    llvm::cl::cat(clOptionsCategory));
-
 // Return a cloned version of `op` that operates on `loopRanges`, assumed to be
 // a subset of the original loop ranges of `op`.
 // This is achieved by applying the `loopToOperandRangesMaps` permutation maps
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
@@ -43,12 +43,6 @@
 #define DEBUG_TYPE "linalg-promotion"

-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-static llvm::cl::opt<bool> clPromoteDynamic(
-    "test-linalg-promote-dynamic",
-    llvm::cl::desc("Test generation of dynamic promoted buffers"),
-    llvm::cl::cat(clOptionsCategory), llvm::cl::init(false));
-
 static Value allocBuffer(Type elementType, Value size, bool dynamicBuffers) {
   auto *ctx = size.getContext();
   auto width = llvm::divideCeil(elementType.getIntOrFloatBitWidth(), 8);
@@ -238,13 +232,19 @@
 namespace {
 struct LinalgPromotionPass : public FunctionPass<LinalgPromotionPass> {
   LinalgPromotionPass() = default;
-  LinalgPromotionPass(bool dynamicBuffers) : dynamicBuffers(dynamicBuffers) {}
+  LinalgPromotionPass(const LinalgPromotionPass &) {}
+  LinalgPromotionPass(bool dynamicBuffers) {
+    this->dynamicBuffers = dynamicBuffers;
+  }

   void runOnFunction() override {
     promoteSubViews(getFunction(), dynamicBuffers);
   }

-  bool dynamicBuffers;
+  Option<bool> dynamicBuffers{
+      *this, "test-promote-dynamic",
+      llvm::cl::desc("Test generation of dynamic promoted buffers"),
+      llvm::cl::init(false)};
 };
 } // namespace

@@ -254,6 +254,4 @@
 }

 static PassRegistration<LinalgPromotionPass>
-    pass("linalg-promote-subviews", "promote subview ops to local buffers", [] {
-      return std::make_unique<LinalgPromotionPass>(clPromoteDynamic);
-    });
+    pass("linalg-promote-subviews", "promote subview ops to local buffers");
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -39,13 +39,6 @@
 #define DEBUG_TYPE "linalg-tiling"

-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-static llvm::cl::list
-    clTileSizes("linalg-tile-sizes",
-                llvm::cl::desc("Tile sizes by which to tile linalg operations"),
-                llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
-                llvm::cl::cat(clOptionsCategory));
-
 static bool isZero(Value v) {
   return isa_and_nonnull<ConstantIndexOp>(v.getDefiningOp()) &&
          cast<ConstantIndexOp>(v.getDefiningOp()).getValue() == 0;
@@ -513,15 +506,19 @@
 template <typename LoopTy>
 struct LinalgTilingPass : public FunctionPass<LinalgTilingPass<LoopTy>> {
   LinalgTilingPass() = default;
+  LinalgTilingPass(const LinalgTilingPass &) {}
   LinalgTilingPass(ArrayRef<int64_t> sizes) {
-    this->tileSizes.assign(sizes.begin(), sizes.end());
+    this->tileSizes->assign(sizes.begin(), sizes.end());
   }

   void runOnFunction() override {
     tileLinalgOps(this->getFunction(), tileSizes);
   }

-  SmallVector tileSizes;
+  Pass::ListOption<int64_t> tileSizes{
+      *this, "linalg-tile-sizes",
+      llvm::cl::desc("Tile sizes by which to tile linalg operations"),
+      llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated};
 };
 } // namespace

@@ -537,17 +534,9 @@
 }

 static PassRegistration>
-    tiling_pass("linalg-tile", "Tile operations in the linalg dialect", [] {
-      auto pass = std::make_unique>();
-      pass->tileSizes.assign(clTileSizes.begin(), clTileSizes.end());
-      return pass;
-    });
+    tiling_pass("linalg-tile", "Tile operations in the linalg dialect");

 static PassRegistration>
     tiling_to_parallel_loops(
         "linalg-tile-to-parallel-loops",
-        "Tile operations in the linalg dialect to parallel loops", [] {
-          auto pass = std::make_unique>();
-          pass->tileSizes.assign(clTileSizes.begin(), clTileSizes.end());
-          return
pass; - }); + "Tile operations in the linalg dialect to parallel loops"); diff --git a/mlir/lib/Support/MlirOptMain.cpp b/mlir/lib/Support/MlirOptMain.cpp --- a/mlir/lib/Support/MlirOptMain.cpp +++ b/mlir/lib/Support/MlirOptMain.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Support/MlirOptMain.h" -#include "mlir/Analysis/Passes.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Diagnostics.h" #include "mlir/IR/Location.h" diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -9,15 +9,11 @@ LoopCoalescing.cpp LoopFusion.cpp LoopInvariantCodeMotion.cpp - LoopTiling.cpp - LoopUnrollAndJam.cpp - LoopUnroll.cpp MemRefDataFlowOpt.cpp OpStats.cpp PipelineDataTransfer.cpp StripDebugInfo.cpp SymbolDCE.cpp - Vectorize.cpp ViewOpGraph.cpp ViewRegionGraph.cpp diff --git a/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir b/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir --- a/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir +++ b/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=2,2 -gpu-workgroup-size=32,4 %s | FileCheck %s +// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=2,2 gpu-workgroup-size=32,4" %s | FileCheck %s module { // arg2 = arg0 * transpose(arg1) ; with intermediate buffer and tile size passed as argument diff --git a/mlir/test/Conversion/LoopsToGPU/imperfect_3D.mlir b/mlir/test/Conversion/LoopsToGPU/imperfect_3D.mlir --- a/mlir/test/Conversion/LoopsToGPU/imperfect_3D.mlir +++ b/mlir/test/Conversion/LoopsToGPU/imperfect_3D.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=4,2,2 -gpu-workgroup-size=32,2,2 %s | FileCheck %s +// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=4,2,2 gpu-workgroup-size=32,2,2" %s | FileCheck %s module { func @imperfect_3D(%arg0 : memref, %arg1 : memref, %arg2 : memref, %arg3 : memref, %t1 : index, %t2 : index, %t3 : index, %step1 : index, %step2 : index, %step3 : index) { @@ -80,4 +80,4 @@ } return } -} \ No newline at end of file +} diff --git a/mlir/test/Conversion/LoopsToGPU/imperfect_4D.mlir b/mlir/test/Conversion/LoopsToGPU/imperfect_4D.mlir --- a/mlir/test/Conversion/LoopsToGPU/imperfect_4D.mlir +++ b/mlir/test/Conversion/LoopsToGPU/imperfect_4D.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=4,2,2 -gpu-workgroup-size=32,2,2 %s | FileCheck %s +// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=4,2,2 gpu-workgroup-size=32,2,2" %s | FileCheck %s module { func @imperfect_3D(%arg0 : memref, %arg1 : memref, %arg2 : memref, %arg3 : memref, %t1 : index, %t2 : index, %t3 : index, %t4 : index, %step1 : index, %step2 : index, %step3 : index, %step4 : index) { @@ -83,4 +83,4 @@ } return } -} \ No newline at end of file +} diff --git a/mlir/test/Conversion/LoopsToGPU/imperfect_linalg.mlir b/mlir/test/Conversion/LoopsToGPU/imperfect_linalg.mlir --- a/mlir/test/Conversion/LoopsToGPU/imperfect_linalg.mlir +++ b/mlir/test/Conversion/LoopsToGPU/imperfect_linalg.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-loop-op-to-gpu -gpu-num-workgroups=2,16 -gpu-workgroup-size=32,4 | FileCheck %s +// RUN: mlir-opt %s -convert-loop-op-to-gpu="gpu-num-workgroups=2,16 gpu-workgroup-size=32,4" | FileCheck %s module { func @fmul(%arg0: memref, %arg1: memref, %arg2: memref) { diff --git 
a/mlir/test/Conversion/LoopsToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/LoopsToGPU/no_blocks_no_threads.mlir --- a/mlir/test/Conversion/LoopsToGPU/no_blocks_no_threads.mlir +++ b/mlir/test/Conversion/LoopsToGPU/no_blocks_no_threads.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=0 -gpu-thread-dims=1 %s | FileCheck --check-prefix=CHECK-THREADS %s --dump-input-on-failure -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=1 -gpu-thread-dims=0 %s | FileCheck --check-prefix=CHECK-BLOCKS %s --dump-input-on-failure +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=0 gpu-thread-dims=1" %s | FileCheck --check-prefix=CHECK-THREADS %s --dump-input-on-failure +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=1 gpu-thread-dims=0" %s | FileCheck --check-prefix=CHECK-BLOCKS %s --dump-input-on-failure // CHECK-THREADS-LABEL: @one_d_loop // CHECK-BLOCKS-LABEL: @one_d_loop diff --git a/mlir/test/Conversion/LoopsToGPU/perfect_1D_setlaunch.mlir b/mlir/test/Conversion/LoopsToGPU/perfect_1D_setlaunch.mlir --- a/mlir/test/Conversion/LoopsToGPU/perfect_1D_setlaunch.mlir +++ b/mlir/test/Conversion/LoopsToGPU/perfect_1D_setlaunch.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=2 -gpu-workgroup-size=32 %s | FileCheck %s +// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=2 gpu-workgroup-size=32" %s | FileCheck %s module { func @foo(%arg0: memref, %arg1 : memref, %arg2 : memref) { @@ -23,4 +23,4 @@ } return } -} \ No newline at end of file +} diff --git a/mlir/test/Conversion/LoopsToGPU/step_one.mlir b/mlir/test/Conversion/LoopsToGPU/step_one.mlir --- a/mlir/test/Conversion/LoopsToGPU/step_one.mlir +++ b/mlir/test/Conversion/LoopsToGPU/step_one.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=1 -gpu-thread-dims=1 %s | FileCheck --check-prefix=CHECK-11 %s -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=2 -gpu-thread-dims=2 %s | FileCheck --check-prefix=CHECK-22 %s +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s | FileCheck --check-prefix=CHECK-11 %s +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=2 gpu-thread-dims=2" %s | FileCheck --check-prefix=CHECK-22 %s // CHECK-11-LABEL: @step_1 // CHECK-22-LABEL: @step_1 diff --git a/mlir/test/Conversion/LoopsToGPU/step_positive.mlir b/mlir/test/Conversion/LoopsToGPU/step_positive.mlir --- a/mlir/test/Conversion/LoopsToGPU/step_positive.mlir +++ b/mlir/test/Conversion/LoopsToGPU/step_positive.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=1 -gpu-thread-dims=1 %s | FileCheck %s +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s | FileCheck %s // CHECK-LABEL: @step_var func @step_var(%A : memref, %B : memref) { diff --git a/mlir/test/Transforms/Vectorize/compose_maps.mlir b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir rename from mlir/test/Transforms/Vectorize/compose_maps.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir --- a/mlir/test/Transforms/Vectorize/compose_maps.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorizer-test -compose-maps 2>&1 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorizer-test -compose-maps 2>&1 | FileCheck %s // For all these cases, the test traverses the `test_affine_map` ops and // composes them in order one-by-one. 
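The compose_maps.mlir test renamed above walks the `test_affine_map` ops in a function and composes their maps one-by-one. As an illustrative sketch only (the `affine_map` attribute name and the composition order are assumptions, not taken from this patch), a minimal case in that style might look like:

func @compose_two() {
  // Hypothetical case: if the maps are applied in listing order, composing
  // (d0 + 1) and then (d0 * 2) would yield the map (d0) -> (d0 * 2 + 2) in the
  // output of -affine-super-vectorizer-test -compose-maps.
  "test_affine_map"() { affine_map = affine_map<(d0) -> (d0 + 1)> } : () -> ()
  "test_affine_map"() { affine_map = affine_map<(d0) -> (d0 * 2)> } : () -> ()
  return
}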
diff --git a/mlir/test/Transforms/Vectorize/normalize_maps.mlir b/mlir/test/Dialect/Affine/SuperVectorize/normalize_maps.mlir rename from mlir/test/Transforms/Vectorize/normalize_maps.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/normalize_maps.mlir --- a/mlir/test/Transforms/Vectorize/normalize_maps.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/normalize_maps.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorizer-test -normalize-maps | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorizer-test -normalize-maps | FileCheck %s // CHECK-DAG: #[[ZERO:[a-zA-Z0-9]+]] = affine_map<() -> (0)> // CHECK-DAG: #[[ID1:[a-zA-Z0-9]+]] = affine_map<(d0) -> (d0)> diff --git a/mlir/test/Transforms/Vectorize/vector_utils.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir rename from mlir/test/Transforms/Vectorize/vector_utils.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir --- a/mlir/test/Transforms/Vectorize/vector_utils.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -affine-vectorizer-test -vector-shape-ratio 4 -vector-shape-ratio 8 2>&1 | FileCheck %s -// RUN: mlir-opt %s -affine-vectorizer-test -vector-shape-ratio 2 -vector-shape-ratio 5 -vector-shape-ratio 2 2>&1 | FileCheck %s -check-prefix=TEST-3x4x5x8 +// RUN: mlir-opt %s -affine-super-vectorizer-test -vector-shape-ratio 4 -vector-shape-ratio 8 2>&1 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorizer-test -vector-shape-ratio 2 -vector-shape-ratio 5 -vector-shape-ratio 2 2>&1 | FileCheck %s -check-prefix=TEST-3x4x5x8 func @vector_add_2d(%arg0: index, %arg1: index) -> f32 { // Nothing should be matched in this first block. diff --git a/mlir/test/Transforms/Vectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir rename from mlir/test/Transforms/Vectorize/vectorize_1d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir --- a/mlir/test/Transforms/Vectorize/vectorize_1d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 128 --test-fastest-varying=0 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" | FileCheck %s // Permutation maps used in vectorization. // CHECK: #[[map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir rename from mlir/test/Transforms/Vectorize/vectorize_2d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir --- a/mlir/test/Transforms/Vectorize/vectorize_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 4 -virtual-vector-size 8 | FileCheck %s -check-prefix=VECT -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=4,8" | FileCheck %s -check-prefix=VECT +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0" | FileCheck %s // Permutation maps used in vectorization. 
// CHECK-DAG: #[[map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_3d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir rename from mlir/test/Transforms/Vectorize/vectorize_3d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir --- a/mlir/test/Transforms/Vectorize/vectorize_3d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 64 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,64,256 test-fastest-varying=2,1,0" | FileCheck %s // Permutation maps used in vectorization. // CHECK: #[[map_proj_d0d1d2_d0d1d2:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir rename from mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir --- a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=0 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=2,0" | FileCheck %s // Permutation maps used in vectorization. // CHECK: #[[map_proj_d0d1d2_d0d2:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d0, d2)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir rename from mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir --- a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=2 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=0,2" | FileCheck %s // Permutation maps used in vectorization. // CHECK: #[[map_proj_d0d1d2_d2d0:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d2, d0)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir rename from mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir --- a/mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=1 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=0,1" | FileCheck %s // Permutation maps used in vectorization. 
// CHECK-DAG: #[[map_proj_d0d1d2_d2d1:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d2, d1)> diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Dialect/Affine/affine-data-copy.mlir rename from mlir/test/Transforms/affine-data-copy.mlir rename to mlir/test/Dialect/Affine/affine-data-copy.mlir diff --git a/mlir/test/Transforms/affine-loop-invariant-code-motion.mlir b/mlir/test/Dialect/Affine/affine-loop-invariant-code-motion.mlir rename from mlir/test/Transforms/affine-loop-invariant-code-motion.mlir rename to mlir/test/Dialect/Affine/affine-loop-invariant-code-motion.mlir diff --git a/mlir/test/Transforms/dma-generate.mlir b/mlir/test/Dialect/Affine/dma-generate.mlir rename from mlir/test/Transforms/dma-generate.mlir rename to mlir/test/Dialect/Affine/dma-generate.mlir diff --git a/mlir/test/Transforms/loop-tiling.mlir b/mlir/test/Dialect/Affine/loop-tiling.mlir rename from mlir/test/Transforms/loop-tiling.mlir rename to mlir/test/Dialect/Affine/loop-tiling.mlir --- a/mlir/test/Transforms/loop-tiling.mlir +++ b/mlir/test/Dialect/Affine/loop-tiling.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -split-input-file -affine-loop-tile -tile-size=32 | FileCheck %s -// RUN: mlir-opt %s -split-input-file -affine-loop-tile -tile-cache-size=512 | FileCheck %s --check-prefix=MODEL +// RUN: mlir-opt %s -split-input-file -affine-loop-tile -affine-tile-size=32 | FileCheck %s +// RUN: mlir-opt %s -split-input-file -affine-loop-tile -affine-tile-cache-size=512 | FileCheck %s --check-prefix=MODEL // ----- diff --git a/mlir/test/Transforms/parallelism-detection.mlir b/mlir/test/Dialect/Affine/parallelism-detection.mlir rename from mlir/test/Transforms/parallelism-detection.mlir rename to mlir/test/Dialect/Affine/parallelism-detection.mlir diff --git a/mlir/test/Transforms/simplify-affine-structures.mlir b/mlir/test/Dialect/Affine/simplify-affine-structures.mlir rename from mlir/test/Transforms/simplify-affine-structures.mlir rename to mlir/test/Dialect/Affine/simplify-affine-structures.mlir diff --git a/mlir/test/Transforms/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir rename from mlir/test/Transforms/slicing-utils.mlir rename to mlir/test/Dialect/Affine/slicing-utils.mlir --- a/mlir/test/Transforms/slicing-utils.mlir +++ b/mlir/test/Dialect/Affine/slicing-utils.mlir @@ -1,6 +1,6 @@ -// RUN: mlir-opt %s -affine-vectorizer-test -forward-slicing=true 2>&1 | FileCheck %s --check-prefix=FWD -// RUN: mlir-opt %s -affine-vectorizer-test -backward-slicing=true 2>&1 | FileCheck %s --check-prefix=BWD -// RUN: mlir-opt %s -affine-vectorizer-test -slicing=true 2>&1 | FileCheck %s --check-prefix=FWDBWD +// RUN: mlir-opt %s -affine-super-vectorizer-test -forward-slicing=true 2>&1 | FileCheck %s --check-prefix=FWD +// RUN: mlir-opt %s -affine-super-vectorizer-test -backward-slicing=true 2>&1 | FileCheck %s --check-prefix=BWD +// RUN: mlir-opt %s -affine-super-vectorizer-test -slicing=true 2>&1 | FileCheck %s --check-prefix=FWDBWD /// 1 2 3 4 /// |_______| |______| diff --git a/mlir/test/Transforms/unroll-jam.mlir b/mlir/test/Dialect/Affine/unroll-jam.mlir rename from mlir/test/Transforms/unroll-jam.mlir rename to mlir/test/Dialect/Affine/unroll-jam.mlir diff --git a/mlir/test/Transforms/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir rename from mlir/test/Transforms/unroll.mlir rename to mlir/test/Dialect/Affine/unroll.mlir diff --git a/mlir/test/Dialect/Linalg/promote.mlir b/mlir/test/Dialect/Linalg/promote.mlir --- a/mlir/test/Dialect/Linalg/promote.mlir +++ 
b/mlir/test/Dialect/Linalg/promote.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s -linalg-promote-subviews | FileCheck %s -// RUN: mlir-opt %s -linalg-promote-subviews -test-linalg-promote-dynamic | FileCheck %s --check-prefix=DYNAMIC +// RUN: mlir-opt %s -linalg-promote-subviews="test-promote-dynamic" | FileCheck %s --check-prefix=DYNAMIC #map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> #map1 = affine_map<(d0) -> (d0 + 2)> diff --git a/mlir/test/Dialect/Linalg/tile.mlir b/mlir/test/Dialect/Linalg/tile.mlir --- a/mlir/test/Dialect/Linalg/tile.mlir +++ b/mlir/test/Dialect/Linalg/tile.mlir @@ -1,7 +1,7 @@ -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2 | FileCheck %s -check-prefix=TILE-2 -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=0,2 | FileCheck %s -check-prefix=TILE-02 -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=0,0,2 | FileCheck %s -check-prefix=TILE-002 -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 | FileCheck %s -check-prefix=TILE-234 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2" | FileCheck %s -check-prefix=TILE-2 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,2" | FileCheck %s -check-prefix=TILE-02 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,2" | FileCheck %s -check-prefix=TILE-002 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" | FileCheck %s -check-prefix=TILE-234 // TILE-2-DAG: #[[strided1D:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> // TILE-02-DAG: #[[strided1D:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> diff --git a/mlir/test/Dialect/Linalg/tile_conv.mlir b/mlir/test/Dialect/Linalg/tile_conv.mlir --- a/mlir/test/Dialect/Linalg/tile_conv.mlir +++ b/mlir/test/Dialect/Linalg/tile_conv.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,0,0,4 | FileCheck %s -check-prefix=TILE-23004 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,0,0,4" | FileCheck %s -check-prefix=TILE-23004 // TILE-23004-DAG: #[[D0x30pS0x10:.*]] = affine_map<(d0) -> (d0 * 30)> // TILE-23004-DAG: #[[S0x10p90:.*]] = affine_map<()[s0] -> (s0 * 10 + 90)> diff --git a/mlir/test/Dialect/Linalg/tile_indexed_generic.mlir b/mlir/test/Dialect/Linalg/tile_indexed_generic.mlir --- a/mlir/test/Dialect/Linalg/tile_indexed_generic.mlir +++ b/mlir/test/Dialect/Linalg/tile_indexed_generic.mlir @@ -1,6 +1,6 @@ -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=10,25 | FileCheck %s -check-prefix=TILE-10n25 -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=25,0 | FileCheck %s -check-prefix=TILE-25n0 -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=0,25 | FileCheck %s -check-prefix=TILE-0n25 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=10,25" | FileCheck %s -check-prefix=TILE-10n25 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=25,0" | FileCheck %s -check-prefix=TILE-25n0 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,25" | FileCheck %s -check-prefix=TILE-0n25 #id_1d = affine_map<(i) -> (i)> #pointwise_1d_trait = { diff --git a/mlir/test/Dialect/Linalg/tile_parallel.mlir b/mlir/test/Dialect/Linalg/tile_parallel.mlir --- a/mlir/test/Dialect/Linalg/tile_parallel.mlir +++ b/mlir/test/Dialect/Linalg/tile_parallel.mlir @@ -1,7 +1,7 @@ -// RUN: mlir-opt %s -linalg-tile-to-parallel-loops -linalg-tile-sizes=2 | FileCheck %s -check-prefix=TILE-2 --dump-input-on-failure -// RUN: mlir-opt %s -linalg-tile-to-parallel-loops -linalg-tile-sizes=0,2 | FileCheck %s -check-prefix=TILE-02 --dump-input-on-failure -// RUN: mlir-opt %s -linalg-tile-to-parallel-loops -linalg-tile-sizes=0,0,2 | FileCheck %s 
-check-prefix=TILE-002 --dump-input-on-failure -// RUN: mlir-opt %s -linalg-tile-to-parallel-loops -linalg-tile-sizes=2,3,4 | FileCheck %s -check-prefix=TILE-234 --dump-input-on-failure +// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=2" | FileCheck %s -check-prefix=TILE-2 --dump-input-on-failure +// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=0,2" | FileCheck %s -check-prefix=TILE-02 --dump-input-on-failure +// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=0,0,2" | FileCheck %s -check-prefix=TILE-002 --dump-input-on-failure +// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=2,3,4" | FileCheck %s -check-prefix=TILE-234 --dump-input-on-failure #id_2d = affine_map<(i, j) -> (i, j)> #pointwise_2d_trait = { diff --git a/mlir/test/lib/Dialect/Affine/CMakeLists.txt b/mlir/test/lib/Dialect/Affine/CMakeLists.txt --- a/mlir/test/lib/Dialect/Affine/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Affine/CMakeLists.txt @@ -1,5 +1,7 @@ add_llvm_library(MLIRAffineTransformsTestPasses TestAffineDataCopy.cpp + TestParallelismDetection.cpp + TestVectorizationUtils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine diff --git a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp --- a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp +++ b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Analysis/Passes.h" #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Pass/Pass.h" diff --git a/mlir/test/lib/Transforms/TestParallelismDetection.cpp b/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp rename from mlir/test/lib/Transforms/TestParallelismDetection.cpp rename to mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp --- a/mlir/test/lib/Transforms/TestParallelismDetection.cpp +++ b/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Analysis/Passes.h" #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/IR/Builders.h" @@ -27,7 +26,6 @@ } // end anonymous namespace - // Walks the function and emits a note for all 'affine.for' ops detected as // parallel. 
void TestParallelismDetection::runOnFunction() { diff --git a/mlir/test/lib/Transforms/TestVectorizationUtils.cpp b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp rename from mlir/test/lib/Transforms/TestVectorizationUtils.cpp rename to mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp --- a/mlir/test/lib/Transforms/TestVectorizationUtils.cpp +++ b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp @@ -27,7 +27,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "affine-vectorizer-test" +#define DEBUG_TYPE "affine-super-vectorizer-test" using namespace mlir; @@ -284,6 +284,7 @@ namespace mlir { void registerVectorizerTestPass() { PassRegistration pass( - "affine-vectorizer-test", "Tests vectorizer standalone functionality."); + "affine-super-vectorizer-test", + "Tests vectorizer standalone functionality."); } } // namespace mlir diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -2,6 +2,7 @@ TestAllReduceLowering.cpp TestCallGraph.cpp TestConstantFold.cpp + TestConvertGPUKernelToCubin.cpp TestLoopFusion.cpp TestGpuMemoryPromotion.cpp TestGpuParallelLoopMapping.cpp @@ -14,10 +15,8 @@ TestMemRefBoundCheck.cpp TestMemRefDependenceCheck.cpp TestMemRefStrideCalculation.cpp - TestParallelismDetection.cpp TestVectorToLoopsConversion.cpp TestVectorTransforms.cpp - TestVectorizationUtils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms @@ -39,6 +38,7 @@ MLIRAnalysis MLIREDSC MLIRGPU + MLIRGPUtoCUDATransforms MLIRLinalgOps MLIRLinalgTransforms MLIRLoopOps diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp @@ -0,0 +1,34 @@ +//===- TestConvertGPUKernelToCubin.cpp - Test gpu kernel cubin lowering ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+
+using namespace mlir;
+
+namespace {
+static OwnedCubin compilePtxToCubinForTesting(const std::string &, Location,
+                                              StringRef) {
+  const char data[] = "CUBIN";
+  return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
+}
+} // end anonymous namespace
+
+#if MLIR_CUDA_CONVERSIONS_ENABLED
+namespace mlir {
+void registerTestConvertGPUKernelToCubinPass() {
+  PassPipelineRegistration<>("test-kernel-to-cubin",
+                             "Convert all kernel functions to CUDA cubin blobs",
+                             [](OpPassManager &pm) {
+                               pm.addPass(createConvertGPUKernelToCubinPass(
+                                   compilePtxToCubinForTesting));
+                             });
+}
+} // namespace mlir
+#endif
diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp
--- a/mlir/test/lib/Transforms/TestLoopFusion.cpp
+++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp
@@ -12,7 +12,6 @@
 #include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
-#include "mlir/Analysis/Passes.h"
 #include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
diff --git a/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp b/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp
--- a/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp
+++ b/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp
@@ -14,7 +14,6 @@
 #include "mlir/ADT/TypeSwitch.h"
 #include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
-#include "mlir/Analysis/Passes.h"
 #include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -35,10 +34,6 @@
 } // end anonymous namespace

-std::unique_ptr> mlir::createTestMemRefBoundCheckPass() {
-  return std::make_unique<TestMemRefBoundCheck>();
-}
-
 void TestMemRefBoundCheck::runOnFunction() {
   getFunction().walk([](Operation *opInst) {
     TypeSwitch(opInst).Case(
diff --git a/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp b/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp
--- a/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp
+++ b/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp
@@ -12,7 +12,6 @@
 #include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
-#include "mlir/Analysis/Passes.h"
 #include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -36,11 +35,6 @@
 } // end anonymous namespace

-std::unique_ptr>
-mlir::createTestMemRefDependenceCheckPass() {
-  return std::make_unique<TestMemRefDependenceCheck>();
-}
-
 // Returns a result string which represents the direction vector (if there was
 // a dependence), returns the string "false" otherwise.
static std::string diff --git a/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir b/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir --- a/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir +++ b/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir @@ -2,8 +2,8 @@ // RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e dot -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s // RUN: mlir-opt %s -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s // RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 -linalg-promote-subviews -convert-linalg-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 -linalg-promote-subviews -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" -linalg-promote-subviews -convert-linalg-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" -linalg-promote-subviews -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s #strided1D = affine_map<(d0) -> (d0)> #strided2D = affine_map<(d0, d1)[s0] -> (d0 * s0 + d1)> diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Analysis/Passes.h" #include "mlir/InitAllDialects.h" #include "mlir/InitAllPasses.h" #include "mlir/IR/Dialect.h" @@ -42,6 +41,7 @@ void registerTestAllReduceLoweringPass(); void registerTestCallGraphPass(); void registerTestConstantFold(); +void registerTestConvertGPUKernelToCubinPass(); void registerTestFunc(); void registerTestGpuMemoryPromotionPass(); void registerTestLinalgTransforms(); @@ -97,6 +97,9 @@ registerTestAllReduceLoweringPass(); registerTestCallGraphPass(); registerTestConstantFold(); +#if MLIR_CUDA_CONVERSIONS_ENABLED + registerTestConvertGPUKernelToCubinPass(); +#endif registerTestFunc(); registerTestGpuMemoryPromotionPass(); registerTestLinalgTransforms(); @@ -112,14 +115,6 @@ registerTestVectorConversions(); registerTestVectorToLoopsPass(); registerVectorizerTestPass(); - - // The following passes are using global initializers, just link them in. - if (std::getenv("bar") != (char *)-1) - return; - - // TODO: move these to the test folder. 
- createTestMemRefBoundCheckPass(); - createTestMemRefDependenceCheckPass(); } static cl::opt diff --git a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h --- a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h +++ b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h @@ -13,7 +13,6 @@ #ifndef VULKAN_RUNTIME_H #define VULKAN_RUNTIME_H -#include "mlir/Analysis/Passes.h" #include "mlir/Dialect/SPIRV/SPIRVOps.h" #include "mlir/Dialect/SPIRV/Serialization.h" #include "mlir/IR/Module.h" diff --git a/openmp/README.rst b/openmp/README.rst --- a/openmp/README.rst +++ b/openmp/README.rst @@ -4,7 +4,7 @@ This repository requires `CMake `_ v2.8.0 or later. LLVM and Clang need a more recent version which also applies for in-tree builds. For more information than available in this document please see -`LLVM's CMake documentation `_ and the +`LLVM's CMake documentation `_ and the `official documentation `_. .. contents:: diff --git a/polly/docs/TipsAndTricks.rst b/polly/docs/TipsAndTricks.rst --- a/polly/docs/TipsAndTricks.rst +++ b/polly/docs/TipsAndTricks.rst @@ -21,7 +21,7 @@ - ``$ bugpoint crash.ll -polly-codegen -opt-args -polly-canonicalize -polly-process-unprofitable`` - For more documentation on bugpoint, `Visit the LLVM manual `_ + For more documentation on bugpoint, `Visit the LLVM manual `_ Understanding which pass makes a particular change