diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -37,13 +37,34 @@
 // multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
 // could use this pass (with some modifications), but currently it implements
 // its own pass to do something similar to what we do here.
+//
+// Overview of the algorithm and terminology in this pass:
+//
+//  - Break up each basic block into pseudo-BBs, composed of instructions which
+//    are guaranteed to transfer control to their successors.
+//  - Within a single pseudo-BB, find all loads, and group them into
+//    "equivalence classes" according to getUnderlyingObject() and loaded
+//    element size.  Do the same for stores.
+//  - For each equivalence class, greedily build "chains".  Each chain has a
+//    leader instruction, and every other member of the chain has a known
+//    constant offset from the first instr in the chain.
+//  - Break up chains so that they contain only contiguous accesses of legal
+//    size with no intervening may-alias instrs.
+//  - Convert each chain to vector instructions.
+//
+// The O(n^2) behavior of this pass comes from initially building the chains.
+// In the worst case we have to compare each new instruction to all of those
+// that came before. To limit this, we only calculate the offset to the leaders
+// of the N most recently-used chains.
 
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -57,6 +78,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -67,22 +89,33 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ModRef.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <cassert>
+#include <cstdint>
 #include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <optional>
 #include <tuple>
+#include <type_traits>
 #include <utility>
+#include <vector>
 
 using namespace llvm;
 
@@ -91,21 +124,114 @@
 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
 
+namespace {
+
+// Equivalence class key, the initial tuple by which we group loads/stores.
+// Loads/stores with different EqClassKeys are never merged.
+//
+// (We could in theory remove element-size from this tuple.  We'd just need
+// to fix up the vector packing/unpacking code.)
+using EqClassKey =
+    std::tuple<const Value * /* result of getUnderlyingObject() */,
+               unsigned /* AddrSpace */,
+               unsigned /* Load/Store element size bits */,
+               char /* IsLoad; char b/c bool can't be a DenseMap key */
+               >;
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const EqClassKey &K) {
+  const auto &[Obj, AS, ElemBits, LoadFlag] = K; // unpack the key for printing
+  OS << (LoadFlag ? "load" : "store") << " of " << *Obj;
+  OS << " of element size " << ElemBits << " bits in addrspace " << AS;
+  return OS;
+}
+
+// A Chain is a set of instructions such that:
+//  - All instructions have the same equivalence class, so in particular all are
+//    loads, or all are stores.
+//  - We know the address accessed by the i'th chain elem relative to the
+//    chain's leader instruction, which is the first instr of the chain in BB
+//    order.
+//
+// Chains have two canonical orderings:
+//  - BB order, sorted by Instr->comesBefore.
+//  - Offset order, sorted by OffsetFromLeader.
+// This pass switches back and forth between these orders.
+struct ChainElem {
+  Instruction *Inst;
+  APInt OffsetFromLeader;
+};
+using Chain = SmallVector<ChainElem, 1>;
+
+void sortChainInBBOrder(Chain &C) {
+  llvm::sort(C, [](auto &X, auto &Y) { return X.Inst->comesBefore(Y.Inst); });
+}
+
+void sortChainInOffsetOrder(Chain &C) {
+  sort(C, [](const ChainElem &LHS, const ChainElem &RHS) {
+    if (LHS.OffsetFromLeader == RHS.OffsetFromLeader)
+      return LHS.Inst->comesBefore(RHS.Inst); // tie: fall back to BB order
+    return LHS.OffsetFromLeader.slt(RHS.OffsetFromLeader);
+  });
+}
+
+void dumpChain(ArrayRef<ChainElem> C) {
+  for (const ChainElem &Elem : C) // one line per element: instr, then offset
+    dbgs() << "  " << *Elem.Inst << " (offset " << Elem.OffsetFromLeader
+           << ")\n";
+}
+
+using EquivalenceClassMap =
+    MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
+
 // FIXME: Assuming stack alignment of 4 is always good enough
-static const unsigned StackAdjustedAlignment = 4;
+constexpr unsigned StackAdjustedAlignment = 4;
 
-namespace {
+Instruction *propagateMetadata(Instruction *I, const Chain &C) {
+  SmallVector<Value *, 8> Members; // llvm::propagateMetadata takes Values.
+  llvm::transform(C, std::back_inserter(Members),
+                  [](const ChainElem &Elem) -> Value * { return Elem.Inst; });
+  return propagateMetadata(I, Members);
+}
+
+bool isInvariantLoad(const Instruction *I) {
+  // Only a load tagged with !invariant.load qualifies; anything else is "no".
+  return isa<LoadInst>(I) && I->hasMetadata(LLVMContext::MD_invariant_load);
+}
+
+/// Reorders the instructions that I depends on (the instructions defining its
+/// operands), to ensure they dominate I.
+void reorder(Instruction *I) {
+  SmallPtrSet<Instruction *, 16> InstructionsToMove;
+  SmallVector<Instruction *, 16> Worklist;
+
+  Worklist.push_back(I);
+  while (!Worklist.empty()) {
+    Instruction *IW = Worklist.pop_back_val();
+    int NumOperands = IW->getNumOperands();
+    for (int i = 0; i < NumOperands; i++) {
+      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
+      if (!IM || IM->getOpcode() == Instruction::PHI)
+        continue;
+
+      // If IM is in another BB, no need to move it, because this pass only
+      // vectorizes instructions within one BB.
+      if (IM->getParent() != I->getParent())
+        continue;
+
+      if (!IM->comesBefore(I)) {
+        InstructionsToMove.insert(IM);
+        Worklist.push_back(IM);
+      }
+    }
+  }
 
-/// ChainID is an arbitrary token that is allowed to be different only for the
-/// accesses that are guaranteed to be considered non-consecutive by
-/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
-/// together and reducing the number of instructions the main search operates on
-/// at a time, i.e. this is to reduce compile time and nothing else as the main
-/// search has O(n^2) time complexity. The underlying type of ChainID should not
-/// be relied upon.
-using ChainID = const Value *;
-using InstrList = SmallVector<Instruction *, 8>;
-using InstrListMap = MapVector<ChainID, InstrList>;
+  // All instructions to move should follow I. Start from I, not from begin().
+  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
+    Instruction *IM = &*(BBI++);
+    if (!InstructionsToMove.count(IM))
+      continue;
+    IM->moveBefore(I);
+  }
+}
 
 class Vectorizer {
   Function &F;
@@ -117,6 +243,12 @@
   const DataLayout &DL;
   IRBuilder<> Builder;
 
+  // We could erase instrs right after vectorizing them, but that can mess up
+  // our BB iterators, and also can make the equivalence class keys point to
+  // freed memory.  This is fixable, but it's simpler just to wait until we're
+  // done with the BB and erase all at once.
+  SmallVector<Instruction *, 128> ToErase;
+
 public:
   Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
              DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
@@ -126,70 +258,80 @@
   bool run();
 
 private:
-  unsigned getPointerAddressSpace(Value *I);
-
   static const unsigned MaxDepth = 3;
 
-  bool isConsecutiveAccess(Value *A, Value *B);
-  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
-                              unsigned Depth = 0) const;
-  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
-                                   unsigned Depth) const;
-  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
-                          unsigned Depth) const;
-
-  /// After vectorization, reorder the instructions that I depends on
-  /// (the instructions defining its operands), to ensure they dominate I.
-  void reorder(Instruction *I);
-
-  /// Returns the first and the last instructions in Chain.
-  std::pair<BasicBlock::iterator, BasicBlock::iterator>
-  getBoundaryInstrs(ArrayRef<Instruction *> Chain);
-
-  /// Erases the original instructions after vectorizing.
-  void eraseInstructions(ArrayRef<Instruction *> Chain);
-
-  /// "Legalize" the vector type that would be produced by combining \p
-  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
-  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
-  /// expected to have more than 4 elements.
-  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
-  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
-
-  /// Finds the largest prefix of Chain that's vectorizable, checking for
-  /// intervening instructions which may affect the memory accessed by the
-  /// instructions within Chain.
+  /// Runs the vectorizer on a "pseudo basic block", which is a range of
+  /// instructions [Begin, End) within one BB all of which have
+  /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
+  bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
+
+  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
+  /// in the same BB with the same value for getUnderlyingObject() etc.
+  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
+                             ArrayRef<Instruction *> EqClass);
+
+  /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
+  /// where all instructions access a known, constant offset from the first
+  /// instruction.
+  bool runOnChain(Chain &C);
+
+  /// Splits the chain into subchains of instructions which read/write a
+  /// contiguous block of memory.  Discards any length-1 subchains (because
+  /// there's nothing to vectorize in there).
+  std::vector<Chain> splitChainByContiguity(Chain &C);
+
+  /// Splits the chain into subchains where it's safe to hoist loads up to the
+  /// beginning of the sub-chain and it's safe to sink stores down to the end
+  /// of the sub-chain.  Discards any length-1 subchains.
+  std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
+
+  /// Splits the chain into subchains that make legal, aligned accesses.
+  /// Discards any length-1 subchains.
+  std::vector<Chain> splitChainByAlignment(Chain &C);
+
+  /// Converts the instrs in the chain into a single vectorized load or store.
+  /// Adds the old scalar loads/stores to ToErase.
+  bool vectorizeChain(Chain &C);
+
+  /// Tries to compute the offset in bytes PtrB - PtrA.
+  std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
+                                         unsigned Depth = 0);
+  std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
+                                                     unsigned Depth);
+  std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
+                                                unsigned Depth);
+
+  /// Gets the element type of the vector that the chain will load or store.
+  /// This is nontrivial because the chain may contain elements of different
+  /// types; e.g. it's legal to have a chain that contains both i32 and float.
+  Type *getChainElemTy(const Chain &C);
+
+  /// Determines whether ChainElem can be moved up (if IsLoadChain) or down (if
+  /// !IsLoadChain) to ChainBegin -- i.e. there are no intervening may-alias
+  /// instructions.
+  ///
+  /// The map ChainOffsets must contain all of the elements in
+  /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
+  /// address.  It's ok if it contains additional entries.
+  template <bool IsLoadChain>
+  bool isSafeToMove(
+      Instruction *ChainElem, Instruction *ChainBegin,
+      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
+
+  /// Collects loads and stores grouped by "equivalence class", where:
+  ///   - all elements in an eq class are a load or all are a store,
+  ///   - they all load/store the same element size (it's OK to have e.g. i8 and
+  ///     <4 x i8> in the same class, but not i32 and <4 x i8>), and
+  ///   - they all have the same value for getUnderlyingObject().
+  EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
+                                                BasicBlock::iterator End);
+
+  /// Partitions Instrs into "chains" where every instruction has a known
+  /// constant offset from the first instr in the chain.
   ///
-  /// The elements of \p Chain must be all loads or all stores and must be in
-  /// address order.
-  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
-
-  /// Collects load and store instructions to vectorize.
-  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
-
-  /// Processes the collected instructions, the \p Map. The values of \p Map
-  /// should be all loads or all stores.
-  bool vectorizeChains(InstrListMap &Map);
-
-  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
-  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
-
-  /// Vectorizes the load instructions in Chain.
-  bool
-  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
-                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
-  /// Vectorizes the store instructions in Chain.
-  bool
-  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
-                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
-  /// Check if this load/store access is misaligned accesses.
-  /// Returns a \p RelativeSpeed of an operation if allowed suitable to
-  /// compare to another result for the same \p AddressSpace and potentially
-  /// different \p Alignment and \p SzInBytes.
-  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
-                          Align Alignment, unsigned &RelativeSpeed);
+  /// Postcondition: For all i, ret[i][0].OffsetFromLeader == 0, because the
+  /// first instr in the chain is the leader, at distance 0 from itself.
+  std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
 };
 
 class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -197,7 +339,8 @@
   static char ID;
 
   LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
-    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
+    initializeLoadStoreVectorizerLegacyPassPass(
+        *PassRegistry::getPassRegistry());
   }
 
   bool runOnFunction(Function &F) override;
@@ -249,11 +392,11 @@
   AssumptionCache &AC =
       getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
-  Vectorizer V(F, AA, AC, DT, SE, TTI);
-  return V.run();
+  return Vectorizer(F, AA, AC, DT, SE, TTI).run();
 }
 
-PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
+                                               FunctionAnalysisManager &AM) {
   // Don't vectorize when the attribute NoImplicitFloat is used.
   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
     return PreservedAnalyses::all();
@@ -264,125 +407,678 @@
   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
 
-  Vectorizer V(F, AA, AC, DT, SE, TTI);
-  bool Changed = V.run();
+  bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
   return Changed ? PA : PreservedAnalyses::all();
 }
 
-// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
-// vectors of Instructions.
-static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
-  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
-  propagateMetadata(I, VL);
-}
-
-// Vectorizer Implementation
 bool Vectorizer::run() {
   bool Changed = false;
-
-  // Scan the blocks in the function in post order.
+  // Break up the BB if there are any instrs which aren't guaranteed to transfer
+  // execution to their successor.
+  //
+  // Consider, for example:
+  //
+  //   def assert_arr_len(int n) { if (n < 2) exit(); }
+  //
+  //   load arr[0]
+  //   call assert_arr_len(arr.length)
+  //   load arr[1]
+  //
+  // Even though assert_arr_len does not read or write any memory, we can't
+  // speculate the second load before the call.  More info at
+  // https://github.com/llvm/llvm-project/issues/52950.
   for (BasicBlock *BB : post_order(&F)) {
-    InstrListMap LoadRefs, StoreRefs;
-    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
-    Changed |= vectorizeChains(LoadRefs);
-    Changed |= vectorizeChains(StoreRefs);
+    // BB must at least have a terminator.
+    assert(!BB->empty());
+
+    SmallVector<BasicBlock::iterator, 8> Barriers;
+    Barriers.push_back(BB->begin());
+    for (Instruction &I : *BB)
+      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+        Barriers.push_back(I.getIterator());
+    Barriers.push_back(BB->end());
+
+    for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
+         ++It)
+      Changed |= runOnPseudoBB(*It, *std::next(It));
+
+    for (Instruction *I : ToErase) {
+      auto *PtrOperand = getLoadStorePointerOperand(I);
+      if (I->use_empty())
+        I->eraseFromParent();
+      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
+    }
+    ToErase.clear();
   }
 
   return Changed;
 }
 
-unsigned Vectorizer::getPointerAddressSpace(Value *I) {
-  if (LoadInst *L = dyn_cast<LoadInst>(I))
-    return L->getPointerAddressSpace();
-  if (StoreInst *S = dyn_cast<StoreInst>(I))
-    return S->getPointerAddressSpace();
-  return -1;
+bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
+                               BasicBlock::iterator End) {
+  LLVM_DEBUG({
+    dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
+    if (End != Begin->getParent()->end())
+      dbgs() << *End;
+    else
+      dbgs() << "<BB end>";
+    dbgs() << ")\n";
+  });
+
+  bool Changed = false;
+  for (const auto &[EqClassKey, EqClass] :
+       collectEquivalenceClasses(Begin, End))
+    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
+
+  return Changed;
 }
 
-// FIXME: Merge with llvm::isConsecutiveAccess
-bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
-  Value *PtrA = getLoadStorePointerOperand(A);
-  Value *PtrB = getLoadStorePointerOperand(B);
-  unsigned ASA = getPointerAddressSpace(A);
-  unsigned ASB = getPointerAddressSpace(B);
+bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
+                                       ArrayRef<Instruction *> EqClass) {
+  bool Changed = false;
 
-  // Check that the address spaces match and that the pointers are valid.
-  if (!PtrA || !PtrB || (ASA != ASB))
-    return false;
+  LLVM_DEBUG({
+    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
+           << " keyed on " << EqClassKey << ":\n";
+    for (Instruction *I : EqClass)
+      dbgs() << "  " << *I << "\n";
+  });
 
-  // Make sure that A and B are different pointers of the same size type.
-  Type *PtrATy = getLoadStoreType(A);
-  Type *PtrBTy = getLoadStoreType(B);
-  if (PtrA == PtrB ||
-      PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
-      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
-      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
-          DL.getTypeStoreSize(PtrBTy->getScalarType()))
-    return false;
+  std::vector<Chain> Chains = gatherChains(EqClass);
+  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
+                    << " nontrivial chains.\n";);
+  for (Chain &C : Chains)
+    Changed |= runOnChain(C);
+  return Changed;
+}
 
-  unsigned PtrOffsetWidth = DL.getIndexSizeInBits(ASA);
-  APInt Size(PtrOffsetWidth, DL.getTypeStoreSize(PtrATy));
+bool Vectorizer::runOnChain(Chain &C) {
+  LLVM_DEBUG({
+    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
+    dumpChain(C);
+  });
 
-  return areConsecutivePointers(PtrA, PtrB, Size);
+  // Split up the chain into increasingly smaller chains, until we can finally
+  // vectorize the chains.
+  //
+  // (Don't be scared by the depth of the loop nest here.  These operations are
+  // all at worst O(n lg n) in the number of instructions, and splitting chains
+  // doesn't change the number of instrs.  So the whole loop nest is O(n lg n).)
+  bool Changed = false;
+  for (auto &C : splitChainByMayAliasInstrs(C))
+    for (auto &C : splitChainByContiguity(C))
+      for (auto &C : splitChainByAlignment(C))
+        Changed |= vectorizeChain(C);
+  return Changed;
 }
 
-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
-                                        APInt PtrDelta, unsigned Depth) const {
-  unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
-  APInt OffsetA(OffsetBitWidth, 0);
-  APInt OffsetB(OffsetBitWidth, 0);
-  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
-  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
+  if (C.empty())
+    return {};
 
-  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
+  sortChainInBBOrder(C);
 
-  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
+  LLVM_DEBUG({
+    dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
+    dumpChain(C);
+  });
+
+  // We know that elements in the chain with non-overlapping offsets can't
+  // alias, but AA may not be smart enough to figure this out.  Use a
+  // hashtable.
+  DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
+  for (const auto &E : C)
+    ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
+
+  // Loads get hoisted up to the first load in the chain.  Stores get sunk
+  // down to the last store in the chain.  Our algorithm for loads is:
+  //
+  //  - Take the first element of the chain.  This is the start of a new chain.
+  //  - Take the next element of `Chain` and check for may-alias instructions
+  //    up to the start of NewChain.  If no may-alias instrs, add it to
+  //    NewChain.  Otherwise, start a new NewChain.
+  //
+  // For stores it's the same except in the reverse direction.
+  //
+  // We expect IsLoad to be an std::bool_constant.
+  auto Impl = [&](auto IsLoad) {
+    // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
+    auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
+      if constexpr (IsLoad())
+        return std::make_pair(C.begin(), C.end());
+      else
+        return std::make_pair(C.rbegin(), C.rend());
+    }(IsLoad);
+    assert(ChainBegin != ChainEnd);
+
+    std::vector<Chain> Chains;
+    SmallVector<ChainElem, 1> NewChain;
+    NewChain.push_back(*ChainBegin);
+    for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
+      if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
+                               ChainOffsets)) {
+        LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
+                          << *ChainIt->Inst << " into " << *ChainBegin->Inst
+                          << "\n");
+        NewChain.push_back(*ChainIt);
+      } else {
+        LLVM_DEBUG(
+            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
+                   << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
+        if (NewChain.size() > 1) {
+          LLVM_DEBUG({
+            dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
+            dumpChain(NewChain);
+          });
+          Chains.push_back(std::move(NewChain));
+        }
+
+        // Start a new chain.
+        NewChain = SmallVector<ChainElem, 1>({*ChainIt});
+      }
+    }
+    if (NewChain.size() > 1) {
+      LLVM_DEBUG({
+        dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
+        dumpChain(NewChain);
+      });
+      Chains.push_back(std::move(NewChain));
+    }
+    return Chains;
+  };
+
+  if (isa<LoadInst>(C[0].Inst))
+    return Impl(/*IsLoad=*/std::bool_constant<true>());
+
+  assert(isa<StoreInst>(C[0].Inst));
+  return Impl(/*IsLoad=*/std::bool_constant<false>());
+}
+
+std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
+  if (C.empty())
+    return {};
+  // Offset order makes contiguous accesses adjacent, so one scan finds runs.
+  sortChainInOffsetOrder(C);
+
+  LLVM_DEBUG({
+    dbgs() << "LSV: splitChainByContiguity considering chain:\n";
+    dumpChain(C);
+  });
+
+  std::vector<Chain> Ret;
+  Ret.push_back({C.front()});
+
+  for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
+    // `Prev` accesses byte offsets [Prev.OffsetFromLeader, PrevReadEnd).
+    auto &CurChain = Ret.back();
+    const ChainElem &Prev = CurChain.back();
+    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
+    assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
+                              "collectEquivalenceClass");
+    APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
+
+    // Extend the current chain if *It starts where Prev ends; else start anew.
+    bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
+    LLVM_DEBUG(dbgs() << "LSV: Instructions are "
+                      << (AreContiguous ? "" : "not ") << "contiguous: "
+                      << *Prev.Inst << " (ends at offset " << PrevReadEnd
+                      << ") -> " << *It->Inst << " (starts at offset "
+                      << It->OffsetFromLeader << ")\n");
+    if (AreContiguous)
+      CurChain.push_back(*It);
+    else
+      Ret.push_back({*It});
+  }
+
+  // Drop length-1 chains; a single access leaves nothing to vectorize.
+  llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
+  return Ret;
+}
+
+Type *Vectorizer::getChainElemTy(const Chain &C) {
+  assert(!C.empty());
+  // Selection rules, in priority order:
+  //  1. If any element is pointer-typed, use an integer type.
+  //  2. Otherwise, use the first integer type that appears in the chain.
+  //  3. Otherwise, use the type of the chain's first element.
+  //
+  // Rule 1 simplifies merging e.g. a load of a ptr with a load of a double:
+  // ptr has no direct conversion to double; it takes ptrtoint plus bitcast.
+  //
+  // Whether rules 2 and 3 have any practical effect is unclear; they are kept
+  // to match this pass's previous behavior.
+  auto ScalarTy = [](const ChainElem &Elem) {
+    return getLoadStoreType(Elem.Inst)->getScalarType();
+  };
+
+  // Rule 1: the integer type is as wide as the first element's scalar type.
+  auto IsPtr = [&](const ChainElem &E) { return ScalarTy(E)->isPointerTy(); };
+  if (any_of(C, IsPtr))
+    return Type::getIntNTy(F.getContext(),
+                           DL.getTypeSizeInBits(ScalarTy(C[0])));
+
+  auto IsInt = [&](const ChainElem &E) { return ScalarTy(E)->isIntegerTy(); };
+  const auto *FirstInt = llvm::find_if(C, IsInt);
+  return ScalarTy(FirstInt != C.end() ? *FirstInt : C[0]);
+}
+
+std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
+  // We use a simple greedy algorithm.
+  //  - Given a chain of length N, find all prefixes that
+  //    (a) are not longer than the max register length, and
+  //    (b) are a power of 2.
+  //  - Starting from the longest prefix, try to create a vector of that length.
+  //  - If one of them works, great.  Repeat the algorithm on any remaining
+  //    elements in the chain.
+  //  - If none of them work, discard the first element and repeat on a chain
+  //    of length N-1.
+  if (C.empty())
+    return {};
+
+  sortChainInOffsetOrder(C);
+
+  LLVM_DEBUG({
+    dbgs() << "LSV: splitChainByAlignment considering chain:\n";
+    dumpChain(C);
+  });
+
+  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
+  auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
+                             unsigned ChainSizeBytes, VectorType *VecTy) {
+    return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
+                                                 ChainSizeBytes, VecTy)
+                       : TTI.getStoreVectorFactor(VF, LoadStoreSize,
+                                                  ChainSizeBytes, VecTy);
+  };
+
+#ifndef NDEBUG
+  for (const auto &E : C) {
+    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
+    assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
+           "Should have filtered out non-power-of-two elements in "
+           "collectEquivalenceClasses.");
+  }
+#endif
+
+  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+  unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
+
+  std::vector<Chain> Ret;
+  for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
+    // Find candidate chains of size not greater than the largest vector reg.
+    // These chains are over the closed interval [CBegin, CEnd].
+    SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
+        CandidateChains;
+    for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
+      APInt Sz = C[CEnd].OffsetFromLeader +
+                 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
+                 C[CBegin].OffsetFromLeader;
+      if (Sz.sgt(VecRegBytes))
+        break;
+      CandidateChains.push_back(
+          {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
+    }
+
+    // Consider the longest chain first.
+    for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
+         It != End; ++It) {
+      auto [CEnd, SizeBytes] = *It;
+      LLVM_DEBUG(
+          dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
+                 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
+
+      Type *VecElemTy = getChainElemTy(C);
+      // Note, VecElemTy is a power of 2, but might be less than one byte.  For
+      // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
+      // VecElemTy would be i4.
+      unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
+
+      // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
+      assert((8 * SizeBytes) % VecElemBits == 0);
+      unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
+      FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
+      unsigned VF = 8 * VecRegBytes / VecElemBits;
+
+      // Check that TTI is happy with this vectorization factor.
+      unsigned TargetVF = getVectorFactor(VF, VecElemBits,
+                                          VecElemBits * NumVecElems / 8, VecTy);
+      if (TargetVF != VF && TargetVF < NumVecElems) {
+        LLVM_DEBUG(
+            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
+                      "because TargetVF="
+                   << TargetVF << " != VF=" << VF
+                   << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
+        continue;
+      }
+
+      // Is a load/store with this alignment allowed by TTI and at least as fast
+      // as an unvectorized load/store?
+      //
+      // TTI and F are explicit captures to work around an MSVC misparse (??).
+      auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
+                               &F = F](Align Alignment) {
+        if (Alignment.value() % SizeBytes == 0)
+          return true;
+        unsigned VectorizedSpeed = 0;
+        bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
+            F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
+        if (!AllowsMisaligned) {
+          LLVM_DEBUG(dbgs()
+                     << "LSV: Access of " << SizeBytes << "B in addrspace "
+                     << AS << " with alignment " << Alignment.value()
+                     << " is misaligned, and therefore can't be vectorized.\n");
+          return false;
+        }
+
+        unsigned ElementwiseSpeed = 0;
+        (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
+                                             Alignment, &ElementwiseSpeed);
+        if (VectorizedSpeed < ElementwiseSpeed) {
+          LLVM_DEBUG(dbgs()
+                     << "LSV: Access of " << SizeBytes << "B in addrspace "
+                     << AS << " with alignment " << Alignment.value()
+                     << " has relative speed " << VectorizedSpeed
+                     << ", which is lower than the elementwise speed of "
+                     << ElementwiseSpeed
+                     << ".  Therefore this access won't be vectorized.\n");
+          return false;
+        }
+        return true;
+      };
+
+      // If we're loading/storing from an alloca, align it if possible.
+      //
+      // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
+      // tells us this is beneficial.  This feels a bit odd, but it matches
+      // existing tests.  This isn't *so* bad, because at most we align to 4
+      // bytes (current value of StackAdjustedAlignment).
+      //
+      // FIXME: We will upgrade the alignment of the alloca even if it turns out
+      // we can't vectorize for some other reason.
+      Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
+      if (AS == DL.getAllocaAddrSpace() && Alignment.value() % SizeBytes != 0 &&
+          IsAllowedAndFast(Align(StackAdjustedAlignment))) {
+        Align NewAlign = getOrEnforceKnownAlignment(
+            getLoadStorePointerOperand(C[CBegin].Inst),
+            Align(StackAdjustedAlignment), DL, C[CBegin].Inst, nullptr, &DT);
+        if (NewAlign >= Alignment) {
+          LLVM_DEBUG(dbgs()
+                     << "LSV: splitByChain upgrading alloca alignment from "
+                     << Alignment.value() << " to " << NewAlign.value()
+                     << "\n");
+          Alignment = NewAlign;
+        }
+      }
+
+      if (!IsAllowedAndFast(Alignment)) {
+        LLVM_DEBUG(
+            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
+                      "because its alignment is not AllowedAndFast: "
+                   << Alignment.value() << "\n");
+        continue;
+      }
+
+      if ((IsLoadChain &&
+           !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
+          (!IsLoadChain &&
+           !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
+        LLVM_DEBUG(
+            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
+                      "because !isLegalToVectorizeLoad/StoreChain.");
+        continue;
+      }
+
+      // Hooray, we can vectorize this chain!
+      Chain &NewChain = Ret.emplace_back();
+      for (unsigned I = CBegin; I <= CEnd; ++I)
+        NewChain.push_back(C[I]);
+      CBegin = CEnd; // Skip over the instructions we've added to the chain.
+      break;
+    }
+  }
+  return Ret;
+}
+
+bool Vectorizer::vectorizeChain(Chain &C) {
+  if (C.size() < 2)
     return false;
 
-  // In case if we have to shrink the pointer
-  // stripAndAccumulateInBoundsConstantOffsets should properly handle a
-  // possible overflow and the value should fit into a smallest data type
-  // used in the cast/gep chain.
-  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
-         OffsetB.getSignificantBits() <= NewPtrBitWidth);
+  sortChainInOffsetOrder(C);
 
-  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
-  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
-  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
+  LLVM_DEBUG({
+    dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
+    dumpChain(C);
+  });
 
-  APInt OffsetDelta = OffsetB - OffsetA;
+  Type *VecElemTy = getChainElemTy(C);
+  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
+  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+  unsigned ChainBytes = std::accumulate(
+      C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
+        return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
+      });
+  assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
+  // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
+  // than 1 byte (e.g. VecTy == <32 x i1>).
+  Type *VecTy = FixedVectorType::get(
+      VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
+
+  Align Alignment = getLoadStoreAlignment(C[0].Inst);
+  // If this is a load/store of an alloca, we might have upgraded the alloca's
+  // alignment earlier.  Get the new alignment.
+  if (AS == DL.getAllocaAddrSpace()) {
+    Alignment = std::max(
+        Alignment,
+        getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
+                                   MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
+  }
 
-  // Check if they are based on the same pointer. That makes the offsets
-  // sufficient.
-  if (PtrA == PtrB)
-    return OffsetDelta == PtrDelta;
-
-  // Compute the necessary base pointer delta to have the necessary final delta
-  // equal to the pointer delta requested.
-  APInt BaseDelta = PtrDelta - OffsetDelta;
-
-  // Compute the distance with SCEV between the base pointers.
-  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
-  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
-  const SCEV *C = SE.getConstant(BaseDelta);
-  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
-  if (X == PtrSCEVB)
+  // All elements of the chain must have the same scalar-type size.
+#ifndef NDEBUG
+  for (const ChainElem &E : C)
+    assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
+           DL.getTypeStoreSize(VecElemTy));
+#endif
+
+  Instruction *VecInst;
+  if (IsLoadChain) {
+    // Loads get hoisted to the location of the first load in the chain.  We may
+    // also need to hoist the (transitive) operands of the loads.
+    Builder.SetInsertPoint(
+        std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) {
+          return A.Inst->comesBefore(B.Inst);
+        })->Inst);
+
+    // Chain is in offset order, so C[0] is the instr with the lowest offset,
+    // i.e. the root of the vector.
+    Value *Bitcast = Builder.CreateBitCast(
+        getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS));
+    VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
+
+    unsigned VecIdx = 0;
+    for (const ChainElem &E : C) {
+      Instruction *I = E.Inst;
+      Value *V;
+      Type *T = getLoadStoreType(I);
+      if (auto *VT = dyn_cast<FixedVectorType>(T)) {
+        auto Mask = llvm::to_vector<8>(
+            llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
+        V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
+        VecIdx += VT->getNumElements();
+      } else {
+        V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
+                                         I->getName());
+        ++VecIdx;
+      }
+      if (V->getType() != I->getType())
+        V = Builder.CreateBitOrPointerCast(V, I->getType());
+      I->replaceAllUsesWith(V);
+    }
+
+    // Finally, we need to reorder the instrs in the BB so that the (transitive)
+    // operands of VecInst appear before it.  To see why, suppose we have
+    // vectorized the following code:
+    //
+    //   ptr1  = gep a, 1
+    //   load1 = load i32 ptr1
+    //   ptr0  = gep a, 0
+    //   load0 = load i32 ptr0
+    //
+    // We will put the vectorized load at the location of the earliest load in
+    // the BB, i.e. load1.  We get:
+    //
+    //   ptr1  = gep a, 1
+    //   loadv = load <2 x i32> ptr0
+    //   load0 = extractelement loadv, 0
+    //   load1 = extractelement loadv, 1
+    //   ptr0 = gep a, 0
+    //
+    // Notice that loadv uses ptr0, which is defined *after* it!
+    reorder(VecInst);
+  } else {
+    // Stores get sunk to the location of the last store in the chain.
+    Builder.SetInsertPoint(
+        std::max_element(C.begin(), C.end(), [](auto &A, auto &B) {
+          return A.Inst->comesBefore(B.Inst);
+        })->Inst);
+
+    // Build the vector to store.
+    Value *Vec = PoisonValue::get(VecTy);
+    unsigned VecIdx = 0;
+    auto InsertElem = [&](Value *V) {
+      if (V->getType() != VecElemTy)
+        V = Builder.CreateBitOrPointerCast(V, VecElemTy);
+      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
+    };
+    for (const ChainElem &E : C) {
+      auto I = cast<StoreInst>(E.Inst);
+      if (FixedVectorType *VT =
+              dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
+        for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
+          InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
+                                                  Builder.getInt32(J)));
+        }
+      } else {
+        InsertElem(I->getValueOperand());
+      }
+    }
+
+    // Chain is in offset order, so C[0] is the instr with the lowest offset,
+    // i.e. the root of the vector.
+    VecInst = Builder.CreateAlignedStore(
+        Vec,
+        Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst),
+                              VecTy->getPointerTo(AS)),
+        Alignment);
+  }
+
+  propagateMetadata(VecInst, C);
+
+  for (const ChainElem &E : C)
+    ToErase.push_back(E.Inst);
+
+  ++NumVectorInstructions;
+  NumScalarsVectorized += C.size();
+  return true;
+}
+
+template <bool IsLoadChain>
+bool Vectorizer::isSafeToMove(
+    Instruction *ChainElem, Instruction *ChainBegin,
+    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
+  LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
+                    << *ChainBegin << ")\n");
+
+  assert(isa<LoadInst>(ChainElem) == IsLoadChain);
+  if (ChainElem == ChainBegin)
     return true;
 
-  // The above check will not catch the cases where one of the pointers is
-  // factorized but the other one is not, such as (C + (S * (A + B))) vs
-  // (AS + BS). Get the minus scev. That will allow re-combining the expresions
-  // and getting the simplified difference.
-  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
-  if (C == Dist)
+  // Invariant loads can always be reordered; by definition they are not
+  // clobbered by stores.
+  if (isInvariantLoad(ChainElem))
     return true;
 
-  // Sometimes even this doesn't work, because SCEV can't always see through
-  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
-  // things the hard way.
-  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
+  auto BBIt = std::next([&] {
+    if constexpr (IsLoadChain)
+      return BasicBlock::reverse_iterator(ChainElem);
+    else
+      return BasicBlock::iterator(ChainElem);
+  }());
+  auto BBItEnd = std::next([&] {
+    if constexpr (IsLoadChain)
+      return BasicBlock::reverse_iterator(ChainBegin);
+    else
+      return BasicBlock::iterator(ChainBegin);
+  }());
+
+  const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
+  const unsigned ChainElemSize =
+      DL.getTypeStoreSize(getLoadStoreType(ChainElem));
+
+  for (; BBIt != BBItEnd; ++BBIt) {
+    Instruction *I = &*BBIt;
+
+    if (!I->mayReadOrWriteMemory())
+      continue;
+
+    // Loads can be reordered with other loads.
+    if (IsLoadChain && isa<LoadInst>(I))
+      continue;
+
+    // Stores can be sunk below invariant loads.
+    if (!IsLoadChain && isInvariantLoad(I))
+      continue;
+
+    // If I is in the chain, we can tell whether it aliases ChainElem by
+    // comparing the offsets they access.  This may be better than AA can do.
+    //
+    // We should really only have duplicate offsets for stores (the duplicate
+    // loads should be CSE'ed), but in case we have a duplicate load, we'll
+    // split the chain so we don't have to handle this case specially.
+    if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
+      // I and ChainElem overlap if:
+      //   - I and ChainElem have the same offset, OR
+      //   - I's offset is less than ChainElem's, but I touches past the
+      //     beginning of ChainElem, OR
+      //   - ChainElem's offset is less than I's, but ChainElem touches past the
+      //     beginning of I.
+      const APInt &IOffset = OffsetIt->second;
+      unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
+      if (IOffset == ChainElemOffset ||
+          (IOffset.sle(ChainElemOffset) &&
+           (IOffset + IElemSize).sgt(ChainElemOffset)) ||
+          (ChainElemOffset.sle(IOffset) &&
+           (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
+        LLVM_DEBUG({
+          // Double check that AA also sees this alias.  If not, we probably
+          // have a bug.
+          ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+          assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
+          dbgs() << "LSV: Found alias in chain: " << *I << "\n";
+        });
+        return false; // We found an aliasing instruction; bail.
+      }
+
+      continue; // We're confident there's no alias.
+    }
+
+    LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
+    ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+    if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
+      LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
+                        << "  Aliasing instruction:\n"
+                        << "    " << *I << '\n'
+                        << "  Aliased instruction and pointer:\n"
+                        << "    " << *ChainElem << '\n'
+                        << "    " << *getLoadStorePointerOperand(ChainElem)
+                        << '\n');
+
+      return false;
+    }
+  }
+  return true;
 }
 
 static bool checkNoWrapFlags(Instruction *I, bool Signed) {
@@ -394,10 +1090,14 @@
 static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                    unsigned MatchingOpIdxA, Instruction *AddOpB,
                                    unsigned MatchingOpIdxB, bool Signed) {
-  // If both OpA and OpB is an add with NSW/NUW and with
-  // one of the operands being the same, we can guarantee that the
-  // transformation is safe if we can prove that OpA won't overflow when
-  // IdxDiff added to the other operand of OpA.
+  LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
+                    << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
+                    << MatchingOpIdxA << ", AddOpB=" << *AddOpB
+                    << ", MatchingOpIdxB=" << MatchingOpIdxB
+                    << ", Signed=" << Signed << "\n");
+  // If both OpA and OpB are adds with NSW/NUW and with one of the operands
+  // being the same, we can guarantee that the transformation is safe if we can
+  // prove that OpA won't overflow when IdxDiff is added to its other operand.
   // For example:
   //  %tmp7 = add nsw i32 %tmp2, %v0
   //  %tmp8 = sext i32 %tmp7 to i64
@@ -406,10 +1106,9 @@
   //  %tmp12 = add nsw i32 %tmp2, %tmp11
   //  %tmp13 = sext i32 %tmp12 to i64
   //
-  //  Both %tmp7 and %tmp2 has the nsw flag and the first operand
-  //  is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
-  //  because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the
-  //  nsw flag.
+  //  Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
+  //  It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
+  //  1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
   assert(AddOpA->getOpcode() == Instruction::Add &&
          AddOpB->getOpcode() == Instruction::Add &&
          checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
@@ -460,24 +1159,26 @@
   return false;
 }
 
-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
-                                             APInt PtrDelta,
-                                             unsigned Depth) const {
+std::optional<APInt> Vectorizer::gtConstantOffsetComplexAddrs(Value *PtrA,
+                                                              Value *PtrB,
+                                                              unsigned Depth) {
+  LLVM_DEBUG(dbgs() << "LSV: gtConstantOffsetComplexAddrs PtrA=" << *PtrA
+                    << " PtrB=" << *PtrB << " Depth=" << Depth << "\n");
   auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
   auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
   if (!GEPA || !GEPB)
-    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
+    return getConstantOffsetSelects(PtrA, PtrB, Depth);
 
   // Look through GEPs after checking they're the same except for the last
   // index.
   if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
       GEPA->getPointerOperand() != GEPB->getPointerOperand())
-    return false;
+    return std::nullopt;
   gep_type_iterator GTIA = gep_type_begin(GEPA);
   gep_type_iterator GTIB = gep_type_begin(GEPB);
   for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
     if (GTIA.getOperand() != GTIB.getOperand())
-      return false;
+      return std::nullopt;
     ++GTIA;
     ++GTIB;
   }
@@ -486,23 +1187,13 @@
   Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
   if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
       OpA->getType() != OpB->getType())
-    return false;
+    return std::nullopt;
 
-  if (PtrDelta.isNegative()) {
-    if (PtrDelta.isMinSignedValue())
-      return false;
-    PtrDelta.negate();
-    std::swap(OpA, OpB);
-  }
   uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
-  if (PtrDelta.urem(Stride) != 0)
-    return false;
-  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
-  APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);
 
   // Only look through a ZExt/SExt.
   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
-    return false;
+    return std::nullopt;
 
   bool Signed = isa<SExtInst>(OpA);
 
@@ -510,7 +1201,21 @@
   Value *ValA = OpA->getOperand(0);
   OpB = dyn_cast<Instruction>(OpB->getOperand(0));
   if (!OpB || ValA->getType() != OpB->getType())
-    return false;
+    return std::nullopt;
+
+  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+  const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
+  if (IdxDiffSCEV == SE.getCouldNotCompute())
+    return std::nullopt;
+
+  ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
+  if (!IdxDiffRange.isSingleElement())
+    return std::nullopt;
+  APInt IdxDiff = *IdxDiffRange.getSingleElement();
+
+  LLVM_DEBUG(dbgs() << "LSV: gtConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
+                    << "\n");
 
   // Now we need to prove that adding IdxDiff to ValA won't overflow.
   bool Safe = false;
@@ -529,10 +1234,9 @@
   if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
       OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
       checkNoWrapFlags(OpB, Signed)) {
-    // In the checks below a matching operand in OpA and OpB is
-    // an operand which is the same in those two instructions.
-    // Below we account for possible orders of the operands of
-    // these add instructions.
+    // In the checks below a matching operand in OpA and OpB is an operand which
+    // is the same in those two instructions.  Below we account for possible
+    // orders of the operands of these add instructions.
     for (unsigned MatchingOpIdxA : {0, 1})
       for (unsigned MatchingOpIdxB : {0, 1})
         if (!Safe)
@@ -543,804 +1247,255 @@
   unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
 
   // Third attempt:
-  // If all set bits of IdxDiff or any higher order bit other than the sign bit
-  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
-  // overflow of any sort.
+  //
+  // Assuming IdxDiff is non-negative: If all set bits of IdxDiff or any higher
+  // order bit other than the sign bit are known to be zero in ValA, we can add
+  // IdxDiff to it while guaranteeing no overflow of any sort.
+  //
+  // If IdxDiff is negative, do the same, but inspect OpB instead of ValA.
   if (!Safe) {
+    // When computing known bits, use the GEPs as context instructions, since
+    // they likely are in the same BB as the load/store.
+    Instruction *ContextInst = GEPA->comesBefore(GEPB) ? GEPB : GEPA;
     KnownBits Known(BitWidth);
-    computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
+    computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
+                     ContextInst, &DT);
     APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
     if (Signed)
       BitsAllowedToBeSet.clearBit(BitWidth - 1);
-    if (BitsAllowedToBeSet.ult(IdxDiff))
-      return false;
+    if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
+      return std::nullopt;
+    Safe = true;
   }
 
-  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
-  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
-  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
-  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
-  return X == OffsetSCEVB;
+  if (Safe)
+    return IdxDiff * Stride;
+  return std::nullopt;
 }
 
-bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
-                                    const APInt &PtrDelta,
-                                    unsigned Depth) const {
+std::optional<APInt>
+Vectorizer::getConstantOffsetSelects(Value *PtrA, Value *PtrB, unsigned Depth) {
   if (Depth++ == MaxDepth)
-    return false;
+    return std::nullopt;
 
   if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
     if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
-      return SelectA->getCondition() == SelectB->getCondition() &&
-             areConsecutivePointers(SelectA->getTrueValue(),
-                                    SelectB->getTrueValue(), PtrDelta, Depth) &&
-             areConsecutivePointers(SelectA->getFalseValue(),
-                                    SelectB->getFalseValue(), PtrDelta, Depth);
+      if (SelectA->getCondition() != SelectB->getCondition())
+        return std::nullopt;
+      LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
+                        << ", PtrB=" << *PtrB << ", Depth=" << Depth << "\n");
+      std::optional<APInt> TrueDiff = getConstantOffset(
+          SelectA->getTrueValue(), SelectB->getTrueValue(), Depth);
+      if (!TrueDiff.has_value())
+        return std::nullopt;
+      std::optional<APInt> FalseDiff = getConstantOffset(
+          SelectA->getFalseValue(), SelectB->getFalseValue(), Depth);
+      if (TrueDiff == FalseDiff)
+        return TrueDiff;
     }
   }
-  return false;
+  return std::nullopt;
 }
 
-void Vectorizer::reorder(Instruction *I) {
-  SmallPtrSet<Instruction *, 16> InstructionsToMove;
-  SmallVector<Instruction *, 16> Worklist;
-
-  Worklist.push_back(I);
-  while (!Worklist.empty()) {
-    Instruction *IW = Worklist.pop_back_val();
-    int NumOperands = IW->getNumOperands();
-    for (int i = 0; i < NumOperands; i++) {
-      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
-      if (!IM || IM->getOpcode() == Instruction::PHI)
-        continue;
-
-      // If IM is in another BB, no need to move it, because this pass only
-      // vectorizes instructions within one BB.
-      if (IM->getParent() != I->getParent())
-        continue;
-
-      if (!IM->comesBefore(I)) {
-        InstructionsToMove.insert(IM);
-        Worklist.push_back(IM);
-      }
+EquivalenceClassMap
+Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
+                                      BasicBlock::iterator End) {
+  EquivalenceClassMap Ret;
+
+  auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
+    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
+    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+      // The selects themselves are distinct instructions even if they share
+      // the same condition and evaluate to consecutive pointers for true and
+      // false values of the condition. Therefore using the selects themselves
+      // for grouping instructions would put consecutive accesses into different
+      // lists, so they would never even be checked for being consecutive and
+      // would not be vectorized.
+      return Sel->getCondition();
     }
-  }
+    return ObjPtr;
+  };
 
-  // All instructions to move should follow I. Start from I, not from begin().
-  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
-       ++BBI) {
-    if (!InstructionsToMove.count(&*BBI))
+  for (Instruction &I : make_range(Begin, End)) {
+    auto *LI = dyn_cast<LoadInst>(&I);
+    auto *SI = dyn_cast<StoreInst>(&I);
+    if (!LI && !SI)
       continue;
-    Instruction *IM = &*BBI;
-    --BBI;
-    IM->removeFromParent();
-    IM->insertBefore(I);
-  }
-}
-
-std::pair<BasicBlock::iterator, BasicBlock::iterator>
-Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
-  Instruction *C0 = Chain[0];
-  BasicBlock::iterator FirstInstr = C0->getIterator();
-  BasicBlock::iterator LastInstr = C0->getIterator();
 
-  BasicBlock *BB = C0->getParent();
-  unsigned NumFound = 0;
-  for (Instruction &I : *BB) {
-    if (!is_contained(Chain, &I))
+    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
       continue;
 
-    ++NumFound;
-    if (NumFound == 1) {
-      FirstInstr = I.getIterator();
-    }
-    if (NumFound == Chain.size()) {
-      LastInstr = I.getIterator();
-      break;
-    }
-  }
-
-  // Range is [first, last).
-  return std::make_pair(FirstInstr, ++LastInstr);
-}
-
-void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
-  SmallVector<Instruction *, 16> Instrs;
-  for (Instruction *I : Chain) {
-    Value *PtrOperand = getLoadStorePointerOperand(I);
-    assert(PtrOperand && "Instruction must have a pointer operand.");
-    Instrs.push_back(I);
-    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
-      Instrs.push_back(GEP);
-  }
-
-  // Erase instructions.
-  for (Instruction *I : Instrs)
-    if (I->use_empty())
-      I->eraseFromParent();
-}
-
-std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
-Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
-                               unsigned ElementSizeBits) {
-  unsigned ElementSizeBytes = ElementSizeBits / 8;
-  unsigned SizeBytes = ElementSizeBytes * Chain.size();
-  unsigned LeftBytes = (SizeBytes - (SizeBytes % 4));
-  // If we're already a multiple of 4 bytes or the whole chain is shorter than 4
-  // bytes, then try splitting down on power-of-2 boundary.
-  if (LeftBytes == SizeBytes || LeftBytes == 0)
-    LeftBytes = PowerOf2Ceil(SizeBytes) / 2;
-  unsigned NumLeft = LeftBytes / ElementSizeBytes;
-  if (NumLeft == 0)
-    NumLeft = 1;
-  LLVM_DEBUG(dbgs() << "LSV: Splitting the chain into " << NumLeft << "+"
-                    << Chain.size() - NumLeft << " elements\n");
-  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
-}
-
-ArrayRef<Instruction *>
-Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
-  // These are in BB order, unlike Chain, which is in address order.
-  SmallVector<Instruction *, 16> MemoryInstrs;
-  SmallVector<Instruction *, 16> ChainInstrs;
-
-  bool IsLoadChain = isa<LoadInst>(Chain[0]);
-  LLVM_DEBUG({
-    for (Instruction *I : Chain) {
-      if (IsLoadChain)
-        assert(isa<LoadInst>(I) &&
-               "All elements of Chain must be loads, or all must be stores.");
-      else
-        assert(isa<StoreInst>(I) &&
-               "All elements of Chain must be loads, or all must be stores.");
-    }
-  });
-
-  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
-    if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
-      ChainInstrs.push_back(&I);
+    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
+        (SI && !TTI.isLegalToVectorizeStore(SI)))
       continue;
-    }
-    if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
-      LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
-                        << I << '\n');
-      break;
-    }
-    if (I.mayReadOrWriteMemory())
-      MemoryInstrs.push_back(&I);
-  }
-
-  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
-  unsigned ChainInstrIdx = 0;
-  Instruction *BarrierMemoryInstr = nullptr;
-
-  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
-    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
-
-    // If a barrier memory instruction was found, chain instructions that follow
-    // will not be added to the valid prefix.
-    if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
-      break;
-
-    // Check (in BB order) if any instruction prevents ChainInstr from being
-    // vectorized. Find and store the first such "conflicting" instruction.
-    for (Instruction *MemInstr : MemoryInstrs) {
-      // If a barrier memory instruction was found, do not check past it.
-      if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
-        break;
 
-      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
-      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
-      if (MemLoad && ChainLoad)
-        continue;
-
-      // We can ignore the alias if the we have a load store pair and the load
-      // is known to be invariant. The load cannot be clobbered by the store.
-      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
-        return LI->hasMetadata(LLVMContext::MD_invariant_load);
-      };
-
-      if (IsLoadChain) {
-        // We can ignore the alias as long as the load comes before the store,
-        // because that means we won't be moving the load past the store to
-        // vectorize it (the vectorized load is inserted at the location of the
-        // first load in the chain).
-        if (ChainInstr->comesBefore(MemInstr) ||
-            (ChainLoad && IsInvariantLoad(ChainLoad)))
-          continue;
-      } else {
-        // Same case, but in reverse.
-        if (MemInstr->comesBefore(ChainInstr) ||
-            (MemLoad && IsInvariantLoad(MemLoad)))
-          continue;
-      }
-
-      ModRefInfo MR =
-          AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
-      if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
-        LLVM_DEBUG({
-          dbgs() << "LSV: Found alias:\n"
-                    "  Aliasing instruction:\n"
-                 << "  " << *MemInstr << '\n'
-                 << "  Aliased instruction and pointer:\n"
-                 << "  " << *ChainInstr << '\n'
-                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
-        });
-        // Save this aliasing memory instruction as a barrier, but allow other
-        // instructions that precede the barrier to be vectorized with this one.
-        BarrierMemoryInstr = MemInstr;
-        break;
-      }
-    }
-    // Continue the search only for store chains, since vectorizing stores that
-    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
-    // up to an aliasing store, but should not pull loads from further down in
-    // the basic block.
-    if (IsLoadChain && BarrierMemoryInstr) {
-      // The BarrierMemoryInstr is a store that precedes ChainInstr.
-      assert(BarrierMemoryInstr->comesBefore(ChainInstr));
-      break;
-    }
-  }
-
-  // Find the largest prefix of Chain whose elements are all in
-  // ChainInstrs[0, ChainInstrIdx).  This is the largest vectorizable prefix of
-  // Chain.  (Recall that Chain is in address order, but ChainInstrs is in BB
-  // order.)
-  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
-      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
-  unsigned ChainIdx = 0;
-  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
-    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
-      break;
-  }
-  return Chain.slice(0, ChainIdx);
-}
-
-static ChainID getChainID(const Value *Ptr) {
-  const Value *ObjPtr = getUnderlyingObject(Ptr);
-  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
-    // The select's themselves are distinct instructions even if they share the
-    // same condition and evaluate to consecutive pointers for true and false
-    // values of the condition. Therefore using the select's themselves for
-    // grouping instructions would put consecutive accesses into different lists
-    // and they won't be even checked for being consecutive, and won't be
-    // vectorized.
-    return Sel->getCondition();
-  }
-  return ObjPtr;
-}
-
-std::pair<InstrListMap, InstrListMap>
-Vectorizer::collectInstructions(BasicBlock *BB) {
-  InstrListMap LoadRefs;
-  InstrListMap StoreRefs;
-
-  for (Instruction &I : *BB) {
-    if (!I.mayReadOrWriteMemory())
+    Type *Ty = getLoadStoreType(&I);
+    if (!VectorType::isValidElementType(Ty->getScalarType()))
       continue;
 
-    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
-      if (!LI->isSimple())
-        continue;
-
-      // Skip if it's not legal.
-      if (!TTI.isLegalToVectorizeLoad(LI))
-        continue;
-
-      Type *Ty = LI->getType();
-      if (!VectorType::isValidElementType(Ty->getScalarType()))
-        continue;
-
-      // Skip weird non-byte sizes. They probably aren't worth the effort of
-      // handling correctly.
-      unsigned TySize = DL.getTypeSizeInBits(Ty);
-      if ((TySize % 8) != 0)
-        continue;
-
-      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
-      // functions are currently using an integer type for the vectorized
-      // load/store, and does not support casting between the integer type and a
-      // vector of pointers (e.g. i64 to <2 x i16*>)
-      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
-        continue;
-
-      Value *Ptr = LI->getPointerOperand();
-      unsigned AS = Ptr->getType()->getPointerAddressSpace();
-      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
-      unsigned VF = VecRegSize / TySize;
-      VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
-      // No point in looking at these if they're too big to vectorize.
-      if (TySize > VecRegSize / 2 ||
-          (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
-        continue;
-
-      // Save the load locations.
-      const ChainID ID = getChainID(Ptr);
-      LoadRefs[ID].push_back(LI);
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
-      if (!SI->isSimple())
-        continue;
-
-      // Skip if it's not legal.
-      if (!TTI.isLegalToVectorizeStore(SI))
-        continue;
-
-      Type *Ty = SI->getValueOperand()->getType();
-      if (!VectorType::isValidElementType(Ty->getScalarType()))
-        continue;
-
-      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
-      // functions are currently using an integer type for the vectorized
-      // load/store, and does not support casting between the integer type and a
-      // vector of pointers (e.g. i64 to <2 x i16*>)
-      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
-        continue;
-
-      // Skip weird non-byte sizes. They probably aren't worth the effort of
-      // handling correctly.
-      unsigned TySize = DL.getTypeSizeInBits(Ty);
-      if ((TySize % 8) != 0)
-        continue;
-
-      Value *Ptr = SI->getPointerOperand();
-      unsigned AS = Ptr->getType()->getPointerAddressSpace();
-      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
-      unsigned VF = VecRegSize / TySize;
-      VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
-      // No point in looking at these if they're too big to vectorize.
-      if (TySize > VecRegSize / 2 ||
-          (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
-        continue;
-
-      // Save store location.
-      const ChainID ID = getChainID(Ptr);
-      StoreRefs[ID].push_back(SI);
-    }
-  }
-
-  return {LoadRefs, StoreRefs};
-}
-
-bool Vectorizer::vectorizeChains(InstrListMap &Map) {
-  bool Changed = false;
-
-  for (const std::pair<ChainID, InstrList> &Chain : Map) {
-    unsigned Size = Chain.second.size();
-    if (Size < 2)
+    // Skip weird non-byte sizes. They probably aren't worth the effort of
+    // handling correctly.
+    unsigned TySize = DL.getTypeSizeInBits(Ty);
+    if ((TySize % 8) != 0)
       continue;
 
-    LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
-
-    // Process the stores in chunks of 64.
-    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
-      unsigned Len = std::min<unsigned>(CE - CI, 64);
-      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
-      Changed |= vectorizeInstructions(Chunk);
-    }
-  }
-
-  return Changed;
-}
-
-bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
-  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
-                    << " instructions.\n");
-  SmallVector<int, 16> Heads, Tails;
-  int ConsecutiveChain[64];
-
-  // Do a quadratic search on all of the given loads/stores and find all of the
-  // pairs of loads/stores that follow each other.
-  for (int i = 0, e = Instrs.size(); i < e; ++i) {
-    ConsecutiveChain[i] = -1;
-    for (int j = e - 1; j >= 0; --j) {
-      if (i == j)
-        continue;
-
-      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
-        if (ConsecutiveChain[i] != -1) {
-          int CurDistance = std::abs(ConsecutiveChain[i] - i);
-          int NewDistance = std::abs(ConsecutiveChain[i] - j);
-          if (j < i || NewDistance > CurDistance)
-            continue; // Should not insert.
-        }
+    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+    // functions are currently using an integer type for the vectorized
+    // load/store, and does not support casting between the integer type and a
+    // vector of pointers (e.g. i64 to <2 x i16*>)
+    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+      continue;
 
-        Tails.push_back(j);
-        Heads.push_back(i);
-        ConsecutiveChain[i] = j;
-      }
-    }
-  }
+    Value *Ptr = getLoadStorePointerOperand(&I);
+    unsigned AS = Ptr->getType()->getPointerAddressSpace();
+    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
 
-  bool Changed = false;
-  SmallPtrSet<Instruction *, 16> InstructionsProcessed;
+    unsigned VF = VecRegSize / TySize;
+    VectorType *VecTy = dyn_cast<VectorType>(Ty);
 
-  for (int Head : Heads) {
-    if (InstructionsProcessed.count(Instrs[Head]))
-      continue;
-    bool LongerChainExists = false;
-    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
-      if (Head == Tails[TIt] &&
-          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
-        LongerChainExists = true;
-        break;
-      }
-    if (LongerChainExists)
+    // Only handle power-of-two sized elements.
+    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
+        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
       continue;
 
-    // We found an instr that starts a chain. Now follow the chain and try to
-    // vectorize it.
-    SmallVector<Instruction *, 16> Operands;
-    int I = Head;
-    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
-      if (InstructionsProcessed.count(Instrs[I]))
-        break;
-
-      Operands.push_back(Instrs[I]);
-      I = ConsecutiveChain[I];
-    }
-
-    bool Vectorized = false;
-    if (isa<LoadInst>(*Operands.begin()))
-      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
-    else
-      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
+    // No point in looking at these if they're too big to vectorize.
+    if (TySize > VecRegSize / 2 ||
+        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
+      continue;
 
-    Changed |= Vectorized;
+    Ret[{getUnderlyingObject(Ptr), AS,
+         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
+         /*IsLoad=*/LI != nullptr}]
+        .push_back(&I);
   }
 
-  return Changed;
+  return Ret;
 }
 
-bool Vectorizer::vectorizeStoreChain(
-    ArrayRef<Instruction *> Chain,
-    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
-  StoreInst *S0 = cast<StoreInst>(Chain[0]);
+std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
+  if (Instrs.empty())
+    return {};
 
-  // If the vector has an int element, default to int for the whole store.
-  Type *StoreTy = nullptr;
-  for (Instruction *I : Chain) {
-    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
-    if (StoreTy->isIntOrIntVectorTy())
-      break;
-
-    if (StoreTy->isPtrOrPtrVectorTy()) {
-      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
-                                DL.getTypeSizeInBits(StoreTy));
-      break;
-    }
-  }
-  assert(StoreTy && "Failed to find store type");
-
-  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
-  unsigned AS = S0->getPointerAddressSpace();
-  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-  unsigned VF = VecRegSize / Sz;
-  unsigned ChainSize = Chain.size();
-  Align Alignment = S0->getAlign();
-
-  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
-    InstructionsProcessed->insert(Chain.begin(), Chain.end());
-    return false;
-  }
-
-  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
-  if (NewChain.empty()) {
-    // No vectorization possible.
-    InstructionsProcessed->insert(Chain.begin(), Chain.end());
-    return false;
-  }
-  if (NewChain.size() == 1) {
-    // Failed after the first instruction. Discard it and try the smaller chain.
-    InstructionsProcessed->insert(NewChain.front());
-    return false;
-  }
+  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
+  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
 
-  // Update Chain to the valid vectorizable subchain.
-  Chain = NewChain;
-  ChainSize = Chain.size();
-
-  // Check if it's legal to vectorize this chain. If not, split the chain and
-  // try again.
-  unsigned EltSzInBytes = Sz / 8;
-  unsigned SzInBytes = EltSzInBytes * ChainSize;
-
-  FixedVectorType *VecTy;
-  auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
-  if (VecStoreTy)
-    VecTy = FixedVectorType::get(StoreTy->getScalarType(),
-                                 Chain.size() * VecStoreTy->getNumElements());
-  else
-    VecTy = FixedVectorType::get(StoreTy, Chain.size());
-
-  // If it's more than the max vector size or the target has a better
-  // vector factor, break it into two pieces.
-  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
-  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
-    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
-                         " Creating two separate arrays.\n");
-    bool Vectorized = false;
-    Vectorized |=
-        vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
-    Vectorized |=
-        vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
-    return Vectorized;
+#ifndef NDEBUG
+  // Check that Instrs is in BB order and all have the same addr space.
+  for (size_t I = 1; I < Instrs.size(); ++I) {
+    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
+    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
   }
+#endif
 
-  LLVM_DEBUG({
-    dbgs() << "LSV: Stores to vectorize:\n";
-    for (Instruction *I : Chain)
-      dbgs() << "  " << *I << "\n";
-  });
-
-  // We won't try again to vectorize the elements of the chain, regardless of
-  // whether we succeed below.
-  InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
-  // If the store is going to be misaligned, don't vectorize it.
-  unsigned RelativeSpeed;
-  if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
-    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
-      unsigned SpeedBefore;
-      accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
-      if (SpeedBefore > RelativeSpeed)
-        return false;
-
-      auto Chains = splitOddVectorElts(Chain, Sz);
-      bool Vectorized = false;
-      Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
-      Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
-      return Vectorized;
+  // Machinery to build an MRU-hashtable of Chains.
+  //
+  // (Ideally this could be done with MapVector, but as currently implemented,
+  // moving an element to the front of a MapVector is O(n).)
+  struct InstrListElem : ilist_node<InstrListElem>,
+                         std::pair<Instruction *, Chain> {
+    explicit InstrListElem(Instruction *I)
+        : std::pair<Instruction *, Chain>(I, {}) {}
+  };
+  struct InstrListElemDenseMapInfo {
+    using PtrInfo = DenseMapInfo<InstrListElem *>;
+    using IInfo = DenseMapInfo<Instruction *>;
+    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
+    static InstrListElem *getTombstoneKey() {
+      return PtrInfo::getTombstoneKey();
     }
-
-    Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
-                                                Align(StackAdjustedAlignment),
-                                                DL, S0, nullptr, &DT);
-    if (NewAlign >= Alignment)
-      Alignment = NewAlign;
-    else
-      return false;
-  }
-
-  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
-    auto Chains = splitOddVectorElts(Chain, Sz);
-    bool Vectorized = false;
-    Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
-    Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
-    return Vectorized;
-  }
-
-  BasicBlock::iterator First, Last;
-  std::tie(First, Last) = getBoundaryInstrs(Chain);
-  Builder.SetInsertPoint(&*Last);
-
-  Value *Vec = PoisonValue::get(VecTy);
-
-  if (VecStoreTy) {
-    unsigned VecWidth = VecStoreTy->getNumElements();
-    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
-      StoreInst *Store = cast<StoreInst>(Chain[I]);
-      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
-        unsigned NewIdx = J + I * VecWidth;
-        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
-                                                      Builder.getInt32(J));
-        if (Extract->getType() != StoreTy->getScalarType())
-          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
-
-        Value *Insert =
-            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
-        Vec = Insert;
-      }
+    static unsigned getHashValue(const InstrListElem *E) {
+      return IInfo::getHashValue(E->first);
     }
-  } else {
-    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
-      StoreInst *Store = cast<StoreInst>(Chain[I]);
-      Value *Extract = Store->getValueOperand();
-      if (Extract->getType() != StoreTy->getScalarType())
-        Extract =
-            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
-
-      Value *Insert =
-          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
-      Vec = Insert;
+    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
+      if (A == getEmptyKey() || B == getEmptyKey())
+        return A == getEmptyKey() && B == getEmptyKey();
+      if (A == getTombstoneKey() || B == getTombstoneKey())
+        return A == getTombstoneKey() && B == getTombstoneKey();
+      return IInfo::isEqual(A->first, B->first);
     }
-  }
-
-  StoreInst *SI = Builder.CreateAlignedStore(
-    Vec,
-    Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
-    Alignment);
-  propagateMetadata(SI, Chain);
-
-  eraseInstructions(Chain);
-  ++NumVectorInstructions;
-  NumScalarsVectorized += Chain.size();
-  return true;
-}
-
-bool Vectorizer::vectorizeLoadChain(
-    ArrayRef<Instruction *> Chain,
-    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
-  LoadInst *L0 = cast<LoadInst>(Chain[0]);
-
-  // If the vector has an int element, default to int for the whole load.
-  Type *LoadTy = nullptr;
-  for (const auto &V : Chain) {
-    LoadTy = cast<LoadInst>(V)->getType();
-    if (LoadTy->isIntOrIntVectorTy())
-      break;
-
-    if (LoadTy->isPtrOrPtrVectorTy()) {
-      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
-                               DL.getTypeSizeInBits(LoadTy));
-      break;
+  };
+  SpecificBumpPtrAllocator<InstrListElem> Allocator;
+  simple_ilist<InstrListElem> MRU;
+  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
+
+  // Compare each instruction in `Instrs` to the leaders of the N most
+  // recently-used chains.  This limits the O(n^2) behavior of this pass while
+  // also allowing us to build arbitrarily long chains.
+  for (Instruction *I : Instrs) {
+    constexpr size_t MaxChainsToTry = 64;
+
+    bool MatchFound = false;
+    auto ChainIter = MRU.begin();
+    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
+         ++J, ++ChainIter) {
+      std::optional<APInt> Offset =
+          getConstantOffset(getLoadStorePointerOperand(ChainIter->first),
+                            getLoadStorePointerOperand(I));
+      if (Offset.has_value()) {
+        // `Offset` might not have the expected number of bits, if e.g. AS has a
+        // different number of bits than opaque pointers.
+        ChainIter->second.push_back(
+            ChainElem{I, Offset.value().sextOrTrunc(ASPtrBits)});
+        // Move ChainIter to the front of the MRU list.
+        MRU.remove(*ChainIter);
+        MRU.push_front(*ChainIter);
+        MatchFound = true;
+        break;
+      }
     }
-  }
-  assert(LoadTy && "Can't determine LoadInst type from chain");
 
-  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
-  unsigned AS = L0->getPointerAddressSpace();
-  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-  unsigned VF = VecRegSize / Sz;
-  unsigned ChainSize = Chain.size();
-  Align Alignment = L0->getAlign();
-
-  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
-    InstructionsProcessed->insert(Chain.begin(), Chain.end());
-    return false;
-  }
-
-  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
-  if (NewChain.empty()) {
-    // No vectorization possible.
-    InstructionsProcessed->insert(Chain.begin(), Chain.end());
-    return false;
-  }
-  if (NewChain.size() == 1) {
-    // Failed after the first instruction. Discard it and try the smaller chain.
-    InstructionsProcessed->insert(NewChain.front());
-    return false;
-  }
-
-  // Update Chain to the valid vectorizable subchain.
-  Chain = NewChain;
-  ChainSize = Chain.size();
-
-  // Check if it's legal to vectorize this chain. If not, split the chain and
-  // try again.
-  unsigned EltSzInBytes = Sz / 8;
-  unsigned SzInBytes = EltSzInBytes * ChainSize;
-  VectorType *VecTy;
-  auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
-  if (VecLoadTy)
-    VecTy = FixedVectorType::get(LoadTy->getScalarType(),
-                                 Chain.size() * VecLoadTy->getNumElements());
-  else
-    VecTy = FixedVectorType::get(LoadTy, Chain.size());
-
-  // If it's more than the max vector size or the target has a better
-  // vector factor, break it into two pieces.
-  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
-  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
-    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
-                         " Creating two separate arrays.\n");
-    bool Vectorized = false;
-    Vectorized |=
-        vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
-    Vectorized |=
-        vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
-    return Vectorized;
-  }
-
-  // We won't try again to vectorize the elements of the chain, regardless of
-  // whether we succeed below.
-  InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
-  // If the load is going to be misaligned, don't vectorize it.
-  unsigned RelativeSpeed;
-  if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
-    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
-      unsigned SpeedBefore;
-      accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
-      if (SpeedBefore > RelativeSpeed)
-        return false;
-
-      auto Chains = splitOddVectorElts(Chain, Sz);
-      bool Vectorized = false;
-      Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
-      Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
-      return Vectorized;
+    if (!MatchFound) {
+      APInt ZeroOffset(ASPtrBits, 0);
+      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
+      E->second.push_back(ChainElem{I, ZeroOffset});
+      MRU.push_front(*E);
+      Chains.insert(E);
     }
-
-    Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
-                                                Align(StackAdjustedAlignment),
-                                                DL, L0, nullptr, &DT);
-    if (NewAlign >= Alignment)
-      Alignment = NewAlign;
-    else
-      return false;
   }
 
-  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
-    auto Chains = splitOddVectorElts(Chain, Sz);
-    bool Vectorized = false;
-    Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
-    Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
-    return Vectorized;
-  }
-
-  LLVM_DEBUG({
-    dbgs() << "LSV: Loads to vectorize:\n";
-    for (Instruction *I : Chain)
-      I->dump();
-  });
+  std::vector<Chain> Ret;
+  Ret.reserve(Chains.size());
+  // Iterate over MRU rather than Chains so the order is deterministic.
+  for (auto &E : MRU)
+    if (E.second.size() > 1)
+      Ret.push_back(std::move(E.second));
+  return Ret;
+}
 
-  // getVectorizablePrefix already computed getBoundaryInstrs.  The value of
-  // Last may have changed since then, but the value of First won't have.  If it
-  // matters, we could compute getBoundaryInstrs only once and reuse it here.
-  BasicBlock::iterator First, Last;
-  std::tie(First, Last) = getBoundaryInstrs(Chain);
-  Builder.SetInsertPoint(&*First);
-
-  Value *Bitcast =
-      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
-  LoadInst *LI =
-      Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
-  propagateMetadata(LI, Chain);
-
-  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
-    Value *CV = Chain[I];
-    Value *V;
-    if (VecLoadTy) {
-      // Extract a subvector using shufflevector.
-      unsigned VecWidth = VecLoadTy->getNumElements();
-      auto Mask =
-          llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
-      V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
-    } else {
-      V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
-    }
+std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
+                                                   unsigned Depth) {
+  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
+                    << ", PtrB=" << *PtrB << ", Depth=" << Depth << "\n");
+  unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
+  APInt OffsetA(OffsetBitWidth, 0);
+  APInt OffsetB(OffsetBitWidth, 0);
+  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
+  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
+    return std::nullopt;
 
-    if (V->getType() != CV->getType()) {
-      V = Builder.CreateBitOrPointerCast(V, CV->getType());
-    }
+  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
+  // should properly handle a possible overflow and the value should fit into
+  // the smallest data type used in the cast/gep chain.
+  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
+         OffsetB.getSignificantBits() <= NewPtrBitWidth);
 
-    // Replace the old instruction.
-    CV->replaceAllUsesWith(V);
+  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
+  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
+  if (PtrA == PtrB)
+    return OffsetB - OffsetA;
+
+  // Try to compute B - A.  The SCEV distance may not be NewPtrBitWidth wide.
+  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
+  if (DistScev != SE.getCouldNotCompute()) {
+    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
+    ConstantRange DistRange = SE.getSignedRange(DistScev);
+    if (const APInt *Dist = DistRange.getSingleElement())
+      return OffsetB - OffsetA + Dist->sextOrTrunc(NewPtrBitWidth);
   }
-
-  // Since we might have opaque pointers we might end up using the pointer
-  // operand of the first load (wrt. memory loaded) for the vector load. Since
-  // this first load might not be the first in the block we potentially need to
-  // reorder the pointer operand (and its operands). If we have a bitcast though
-  // it might be before the load and should be the reorder start instruction.
-  // "Might" because for opaque pointers the "bitcast" is just the first loads
-  // pointer operand, as oppposed to something we inserted at the right position
-  // ourselves.
-  Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
-  reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);
-
-  eraseInstructions(Chain);
-
-  ++NumVectorInstructions;
-  NumScalarsVectorized += Chain.size();
-  return true;
-}
-
-bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
-                                    Align Alignment, unsigned &RelativeSpeed) {
-  RelativeSpeed = 0;
-  if (Alignment.value() % SzInBytes == 0)
-    return false;
-
-  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
-                                                   SzInBytes * 8, AddressSpace,
-                                                   Alignment, &RelativeSpeed);
-  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
-                    << " with relative speed = " << RelativeSpeed << '\n';);
-  return !Allows || !RelativeSpeed;
+  std::optional<APInt> Diff = gtConstantOffsetComplexAddrs(PtrA, PtrB, Depth);
+  if (Diff.has_value())
+    return OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth());
+  return std::nullopt;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -1277,26 +1277,26 @@
 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: sdivrem_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
-; GFX8-NEXT:    s_ashr_i32 s16, s13, 31
-; GFX8-NEXT:    s_add_u32 s0, s8, s2
-; GFX8-NEXT:    s_addc_u32 s1, s9, s2
-; GFX8-NEXT:    s_add_u32 s6, s12, s16
+; GFX8-NEXT:    s_ashr_i32 s4, s13, 31
+; GFX8-NEXT:    s_ashr_i32 s16, s1, 31
+; GFX8-NEXT:    s_add_u32 s12, s12, s4
+; GFX8-NEXT:    s_addc_u32 s13, s13, s4
+; GFX8-NEXT:    s_add_u32 s0, s0, s16
 ; GFX8-NEXT:    s_mov_b32 s17, s16
-; GFX8-NEXT:    s_addc_u32 s7, s13, s16
-; GFX8-NEXT:    s_xor_b64 s[8:9], s[6:7], s[16:17]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX8-NEXT:    s_mov_b32 s3, s2
-; GFX8-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
+; GFX8-NEXT:    s_addc_u32 s1, s1, s16
+; GFX8-NEXT:    s_xor_b64 s[6:7], s[0:1], s[16:17]
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX8-NEXT:    s_mov_b32 s5, s4
+; GFX8-NEXT:    s_xor_b64 s[12:13], s[12:13], s[4:5]
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_sub_u32 s6, 0, s8
-; GFX8-NEXT:    s_subb_u32 s7, 0, s9
-; GFX8-NEXT:    s_xor_b64 s[18:19], s[2:3], s[16:17]
+; GFX8-NEXT:    s_sub_u32 s18, 0, s6
+; GFX8-NEXT:    s_subb_u32 s19, 0, s7
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1304,12 +1304,10 @@
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT:    s_ashr_i32 s16, s15, 31
-; GFX8-NEXT:    s_mov_b32 s17, s16
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2]
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2]
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v3, v1
@@ -1332,14 +1330,16 @@
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2]
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2]
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX8-NEXT:    s_xor_b64 s[18:19], s[4:5], s[16:17]
+; GFX8-NEXT:    s_ashr_i32 s16, s3, 31
+; GFX8-NEXT:    s_mov_b32 s17, s16
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
@@ -1377,46 +1377,46 @@
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s12, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT:    v_mov_b32_e32 v5, s9
-; GFX8-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_ashr_i32 s12, s15, 31
 ; GFX8-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], s13, v1
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v7
 ; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s8, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v6
+; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s6, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[0:1]
 ; GFX8-NEXT:    v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], 1, v4
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v1
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
-; GFX8-NEXT:    s_add_u32 s0, s10, s12
-; GFX8-NEXT:    s_addc_u32 s1, s11, s12
-; GFX8-NEXT:    s_add_u32 s10, s14, s16
-; GFX8-NEXT:    s_addc_u32 s11, s15, s16
-; GFX8-NEXT:    s_xor_b64 s[10:11], s[10:11], s[16:17]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v14, s11
+; GFX8-NEXT:    s_add_u32 s0, s14, s12
+; GFX8-NEXT:    s_addc_u32 s1, s15, s12
+; GFX8-NEXT:    s_add_u32 s2, s2, s16
+; GFX8-NEXT:    s_addc_u32 s3, s3, s16
+; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v14, s3
 ; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s10
-; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, s8, v8
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
+; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, s6, v8
 ; GFX8-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v0, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v14
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v5
@@ -1431,15 +1431,15 @@
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v13, v0
 ; GFX8-NEXT:    s_mov_b32 s13, s12
-; GFX8-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
-; GFX8-NEXT:    s_sub_u32 s3, 0, s10
+; GFX8-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
+; GFX8-NEXT:    s_sub_u32 s5, 0, s2
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s3, v13, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v13, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v12
-; GFX8-NEXT:    s_subb_u32 s20, 0, s11
+; GFX8-NEXT:    s_subb_u32 s20, 0, s3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v3, v10, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v5, v[1:2]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v5, v[1:2]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v8, v15, s[0:1]
 ; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], s20, v13, v[1:2]
@@ -1468,22 +1468,22 @@
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v13, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s3, v8, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s18, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s3, v5, v[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1]
 ; GFX8-NEXT:    v_xor_b32_e32 v9, s19, v10
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s19
 ; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s18, v1
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v9, v10, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v4, s2, v7
+; GFX8-NEXT:    v_xor_b32_e32 v4, s4, v7
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v5, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v9, v8, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v11, v8, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v5, v2
-; GFX8-NEXT:    v_xor_b32_e32 v6, s2, v6
+; GFX8-NEXT:    v_xor_b32_e32 v6, s4, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v11
@@ -1503,56 +1503,56 @@
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v10, s2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s9, v2
-; GFX8-NEXT:    v_mul_lo_u32 v8, s8, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s2, v4
+; GFX8-NEXT:    v_mov_b32_e32 v10, s4
+; GFX8-NEXT:    v_mul_lo_u32 v7, s7, v2
+; GFX8-NEXT:    v_mul_lo_u32 v8, s6, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v6, v10, vcc
-; GFX8-NEXT:    v_mul_hi_u32 v6, s8, v2
+; GFX8-NEXT:    v_mul_hi_u32 v6, s6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v7, s9, v3
-; GFX8-NEXT:    v_mul_hi_u32 v2, s9, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s7, v3
+; GFX8-NEXT:    v_mul_hi_u32 v2, s7, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT:    v_mul_hi_u32 v8, s8, v3
+; GFX8-NEXT:    v_mul_hi_u32 v8, s6, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v2, v6
-; GFX8-NEXT:    v_mul_hi_u32 v9, s9, v3
-; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s10, v8, 0
+; GFX8-NEXT:    v_mul_hi_u32 v9, s7, v3
+; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
-; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s10, v9, v[3:4]
-; GFX8-NEXT:    v_mov_b32_e32 v10, s9
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s8, v2
-; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s11, v8, v[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4]
+; GFX8-NEXT:    v_mov_b32_e32 v10, s7
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s6, v2
+; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_subb_u32_e64 v7, s[0:1], v10, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s9, v6
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v7
+; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s7, v6
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v7
 ; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s10, v2
+; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s2, v2
 ; GFX8-NEXT:    v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v13, s[0:1], 1, v8
 ; GFX8-NEXT:    v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v12
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v11
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v11
 ; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v6, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s10, v11
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v12
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s2, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v15, v15, v16, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v16, s[0:1], 1, v13
 ; GFX8-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
@@ -1578,38 +1578,37 @@
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s12, v6
 ; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v8, vcc
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, s5
-; GFX8-NEXT:    v_mov_b32_e32 v8, s4
+; GFX8-NEXT:    v_mov_b32_e32 v8, s8
+; GFX8-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdivrem_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s9, 31
-; GFX9-NEXT:    s_ashr_i32 s16, s13, 31
-; GFX9-NEXT:    s_add_u32 s0, s8, s2
-; GFX9-NEXT:    s_addc_u32 s1, s9, s2
-; GFX9-NEXT:    s_add_u32 s6, s12, s16
+; GFX9-NEXT:    s_ashr_i32 s4, s13, 31
+; GFX9-NEXT:    s_ashr_i32 s16, s1, 31
+; GFX9-NEXT:    s_add_u32 s12, s12, s4
+; GFX9-NEXT:    s_addc_u32 s13, s13, s4
+; GFX9-NEXT:    s_add_u32 s0, s0, s16
 ; GFX9-NEXT:    s_mov_b32 s17, s16
-; GFX9-NEXT:    s_addc_u32 s7, s13, s16
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[6:7], s[16:17]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
+; GFX9-NEXT:    s_addc_u32 s1, s1, s16
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[16:17]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[12:13], s[4:5]
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_u32 s6, 0, s8
-; GFX9-NEXT:    s_subb_u32 s7, 0, s9
-; GFX9-NEXT:    s_xor_b64 s[18:19], s[2:3], s[16:17]
+; GFX9-NEXT:    s_sub_u32 s18, 0, s6
+; GFX9-NEXT:    s_subb_u32 s19, 0, s7
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1617,12 +1616,10 @@
 ; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT:    s_ashr_i32 s16, s15, 31
-; GFX9-NEXT:    s_mov_b32 s17, s16
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v1
@@ -1644,15 +1641,17 @@
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, s7
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v3, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX9-NEXT:    s_xor_b64 s[18:19], s[4:5], s[16:17]
+; GFX9-NEXT:    s_ashr_i32 s16, s3, 31
+; GFX9-NEXT:    s_mov_b32 s17, s16
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
@@ -1688,47 +1687,47 @@
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v5, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add3_u32 v4, v3, v0, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s6, v4, v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, s12, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[2:3]
-; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s7, v5, v[2:3]
+; GFX9-NEXT:    s_ashr_i32 s12, s15, 31
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s13, v2
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT:    v_subrev_co_u32_e32 v9, vcc, s8, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v6
+; GFX9-NEXT:    v_subrev_co_u32_e32 v9, vcc, s6, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[0:1]
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[0:1], 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], 1, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v10
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v10
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s10, s12
-; GFX9-NEXT:    s_addc_u32 s1, s11, s12
-; GFX9-NEXT:    s_add_u32 s10, s14, s16
-; GFX9-NEXT:    s_addc_u32 s11, s15, s16
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[16:17]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v15, s11
+; GFX9-NEXT:    s_add_u32 s0, s14, s12
+; GFX9-NEXT:    s_addc_u32 s1, s15, s12
+; GFX9-NEXT:    s_add_u32 s2, s2, s16
+; GFX9-NEXT:    s_addc_u32 s3, s3, s16
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v15, s3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s10
-; GFX9-NEXT:    v_subrev_co_u32_e32 v16, vcc, s8, v9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v16, vcc, s6, v9
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v17, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v15
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v7
@@ -1743,14 +1742,14 @@
 ; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v1
 ; GFX9-NEXT:    s_mov_b32 s13, s12
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
-; GFX9-NEXT:    s_sub_u32 s3, 0, s10
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v14, 0
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
+; GFX9-NEXT:    s_sub_u32 s5, 0, s2
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v14, 0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v13
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    s_subb_u32 s14, 0, s11
+; GFX9-NEXT:    s_subb_u32 s14, 0, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s3, v13, v[2:3]
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v13, v[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v4, v11, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v14, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s14, v14, v[2:3]
@@ -1778,23 +1777,23 @@
 ; GFX9-NEXT:    v_add3_u32 v2, v4, v3, v2
 ; GFX9-NEXT:    v_add_co_u32_e64 v11, s[0:1], v14, v1
 ; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[0:1], v13, v2, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s3, v11, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s5, v11, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v9, s18, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v12, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v7, s19, v7
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s14, v11, v[1:2]
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s19
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s18, v9
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v10, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v5, s2, v8
+; GFX9-NEXT:    v_xor_b32_e32 v5, s4, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v12, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v11, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v12, v3
-; GFX9-NEXT:    v_xor_b32_e32 v6, s2, v6
+; GFX9-NEXT:    v_xor_b32_e32 v6, s4, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
@@ -1813,55 +1812,55 @@
 ; GFX9-NEXT:    v_add3_u32 v4, v8, v7, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v11, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v12, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, s9, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s8, v4
-; GFX9-NEXT:    v_mul_hi_u32 v10, s8, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, s9, v3
-; GFX9-NEXT:    v_mul_hi_u32 v12, s9, v4
+; GFX9-NEXT:    v_mul_lo_u32 v7, s7, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, s6, v4
+; GFX9-NEXT:    v_mul_hi_u32 v10, s6, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GFX9-NEXT:    v_mul_hi_u32 v12, s7, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, s9, v4
+; GFX9-NEXT:    v_mul_lo_u32 v10, s7, v4
 ; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT:    v_mul_hi_u32 v8, s8, v4
-; GFX9-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-NEXT:    v_mul_hi_u32 v8, s6, v4
+; GFX9-NEXT:    v_mov_b32_e32 v9, s4
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v3, v7
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s10, v11, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s2, v5
+; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s4, v5
 ; GFX9-NEXT:    v_add_u32_e32 v8, v10, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
 ; GFX9-NEXT:    v_add3_u32 v9, v8, v7, v12
-; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s10, v9, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v10, s9
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s8, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s11, v11, v[7:8]
-; GFX9-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v10, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8]
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
-; GFX9-NEXT:    v_sub_u32_e32 v7, s9, v7
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v8
+; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v12, vcc, s10, v3
+; GFX9-NEXT:    v_subrev_co_u32_e32 v12, vcc, s2, v3
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v13, s[0:1], 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v11
 ; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v13
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v12
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v12
 ; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v13
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s10, v12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v13
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s2, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v17, s[0:1], 1, v14
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
@@ -1887,45 +1886,46 @@
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s12
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s12, v7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, v8, v9, vcc
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
-; GFX9-NEXT:    global_store_dwordx4 v0, v[5:8], s[6:7]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[8:9]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[5:8], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
-; GFX10-NEXT:    s_ashr_i32 s6, s13, 31
-; GFX10-NEXT:    s_add_u32 s0, s8, s2
-; GFX10-NEXT:    s_addc_u32 s1, s9, s2
-; GFX10-NEXT:    s_add_u32 s8, s12, s6
-; GFX10-NEXT:    s_mov_b32 s7, s6
-; GFX10-NEXT:    s_addc_u32 s9, s13, s6
-; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX10-NEXT:    s_sub_u32 s20, 0, s8
-; GFX10-NEXT:    s_subb_u32 s21, 0, s9
-; GFX10-NEXT:    s_ashr_i32 s12, s11, 31
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX10-NEXT:    s_xor_b64 s[18:19], s[2:3], s[6:7]
-; GFX10-NEXT:    s_ashr_i32 s16, s15, 31
+; GFX10-NEXT:    s_ashr_i32 s16, s1, 31
+; GFX10-NEXT:    s_ashr_i32 s4, s13, 31
+; GFX10-NEXT:    s_mov_b32 s17, s16
+; GFX10-NEXT:    s_add_u32 s12, s12, s4
+; GFX10-NEXT:    s_addc_u32 s13, s13, s4
+; GFX10-NEXT:    s_add_u32 s0, s0, s16
+; GFX10-NEXT:    s_addc_u32 s1, s1, s16
+; GFX10-NEXT:    s_mov_b32 s5, s4
+; GFX10-NEXT:    s_xor_b64 s[6:7], s[0:1], s[16:17]
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[12:13], s[4:5]
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX10-NEXT:    s_sub_u32 s20, 0, s6
+; GFX10-NEXT:    s_subb_u32 s21, 0, s7
+; GFX10-NEXT:    s_ashr_i32 s12, s15, 31
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX10-NEXT:    s_xor_b64 s[18:19], s[4:5], s[16:17]
+; GFX10-NEXT:    s_ashr_i32 s16, s3, 31
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT:    s_add_u32 s6, s10, s12
-; GFX10-NEXT:    s_addc_u32 s7, s11, s12
-; GFX10-NEXT:    s_add_u32 s10, s14, s16
+; GFX10-NEXT:    s_add_u32 s14, s14, s12
+; GFX10-NEXT:    s_addc_u32 s15, s15, s12
+; GFX10-NEXT:    s_add_u32 s2, s2, s16
 ; GFX10-NEXT:    s_mov_b32 s17, s16
-; GFX10-NEXT:    s_addc_u32 s11, s15, s16
+; GFX10-NEXT:    s_addc_u32 s3, s3, s16
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    s_xor_b64 s[10:11], s[10:11], s[16:17]
+; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
 ; GFX10-NEXT:    s_mov_b32 s13, s12
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_xor_b64 s[14:15], s[6:7], s[12:13]
+; GFX10-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1941,62 +1941,62 @@
 ; GFX10-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v4
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s3, s20, v6, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s20, v6, 0
 ; GFX10-NEXT:    v_mul_lo_u32 v8, s21, v6
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v4
-; GFX10-NEXT:    s_sub_u32 s3, 0, s10
-; GFX10-NEXT:    s_subb_u32 s6, 0, s11
+; GFX10-NEXT:    s_sub_u32 s5, 0, s2
+; GFX10-NEXT:    s_subb_u32 s22, 0, s3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX10-NEXT:    v_mul_lo_u32 v9, s3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v9, s5, v3
 ; GFX10-NEXT:    v_add3_u32 v7, v1, v7, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v10, v5, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v11, v6, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s7, s3, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v8, s6, v4
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s23, s5, v4, 0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s22, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v5, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v13, v5, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v14, v6, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v9, v8
-; GFX10-NEXT:    v_add_co_u32 v10, s7, v10, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v0, s7, v13, v0
+; GFX10-NEXT:    v_add_co_u32 v10, s23, v10, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v0, s23, v13, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v8, v3, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s23
 ; GFX10-NEXT:    v_mul_lo_u32 v15, v4, v2
-; GFX10-NEXT:    v_add_co_u32 v10, s7, v10, v11
+; GFX10-NEXT:    v_add_co_u32 v10, s23, v10, v11
 ; GFX10-NEXT:    v_mul_hi_u32 v9, v4, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v3, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v0, s7, v0, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v0, s23, v0, v14
 ; GFX10-NEXT:    v_mul_lo_u32 v14, v3, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s23
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, v12, v10
-; GFX10-NEXT:    v_add_co_u32 v8, s7, v8, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v8, s23, v8, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s23
 ; GFX10-NEXT:    v_mul_hi_u32 v16, v4, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, v13, v11
-; GFX10-NEXT:    v_add_co_u32 v1, s7, v14, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v0, s7, v0, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v8, s7, v8, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v9, s7, v1, v16
+; GFX10-NEXT:    v_add_co_u32 v1, s23, v14, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v0, s23, v0, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v8, s23, v8, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v9, s23, v1, v16
 ; GFX10-NEXT:    v_add3_u32 v7, v11, v10, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s23
 ; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, v12, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v3, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, v13, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s7, s20, v6, 0
-; GFX10-NEXT:    v_add_co_u32 v7, s7, v9, v8
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s20, v6, 0
+; GFX10-NEXT:    v_add_co_u32 v7, s23, v9, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v9, s21, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s20, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s23
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v7
 ; GFX10-NEXT:    v_add3_u32 v2, v10, v8, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v8, v5, v0
@@ -2005,74 +2005,73 @@
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v5, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v7
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s7, s3, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v4
-; GFX10-NEXT:    v_mul_lo_u32 v11, s3, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s20, s5, v4, 0
+; GFX10-NEXT:    v_mul_lo_u32 v9, s22, v4
+; GFX10-NEXT:    v_mul_lo_u32 v11, s5, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v13, v5, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v14, v6, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX10-NEXT:    v_add_co_u32 v8, s3, v8, v12
+; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v12
 ; GFX10-NEXT:    v_mul_lo_u32 v15, v3, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v16, v4, v1
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v11, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v0, s3, v13, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v8, s3, v8, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v0, s3, v0, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v0, s5, v13, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
 ; GFX10-NEXT:    v_mul_lo_u32 v12, v4, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v3, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v13, v3, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, v11, v10
 ; GFX10-NEXT:    v_mul_hi_u32 v9, v4, v2
-; GFX10-NEXT:    v_add_co_u32 v0, s3, v0, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v11, s3, v15, v12
+; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v11, s5, v15, v12
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v0
 ; GFX10-NEXT:    v_add3_u32 v7, v10, v8, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v1, s3, v13, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v13, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v8, s3, v11, v16
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v1, s3, v1, v9
+; GFX10-NEXT:    v_add_co_u32 v8, s5, v11, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v9
 ; GFX10-NEXT:    v_mul_lo_u32 v7, s1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v9, s0, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v10, s1, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s1, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, v12, v8
 ; GFX10-NEXT:    v_mul_hi_u32 v12, s0, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v5
-; GFX10-NEXT:    v_add_co_u32 v7, s3, v7, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v10, s3, v11, v10
-; GFX10-NEXT:    v_add_co_u32 v0, s6, v7, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v10, s3, v10, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v7, s5, v7, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v10, s5, v11, v10
+; GFX10-NEXT:    v_add_co_u32 v0, s20, v7, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v10, s5, v10, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v9, v0
-; GFX10-NEXT:    v_add_co_u32 v8, s3, v1, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v8, s5, v1, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v11
-; GFX10-NEXT:    v_add_co_u32 v9, s3, v10, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v9, s5, v10, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s5
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v3, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, v13, v6
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v8
 ; GFX10-NEXT:    v_add3_u32 v5, v7, v0, v5
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_mul_hi_u32 v8, s14, v4
 ; GFX10-NEXT:    v_add3_u32 v2, v6, v1, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s3, s8, v9, 0
-; GFX10-NEXT:    v_mul_lo_u32 v6, s9, v9
-; GFX10-NEXT:    v_mul_lo_u32 v7, s8, v5
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s6, v9, 0
+; GFX10-NEXT:    v_mul_lo_u32 v6, s7, v9
+; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v5
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s15, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v4, s15, v4
@@ -2084,23 +2083,23 @@
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v12, s1, v1
 ; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, s0, v0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v14, s0, s1, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s9, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v13
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v13, s8
+; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v13, s6
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v14
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s9, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v14
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v12
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v15
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v19, s0, v6, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v20, s0, 0, v7, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v14
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v1, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v15
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, v18, v17, s0
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, v3, v10
 ; GFX10-NEXT:    v_mul_hi_u32 v10, s14, v2
@@ -2117,14 +2116,14 @@
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v8, v10
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    v_sub_co_u32 v8, s0, v12, s8
+; GFX10-NEXT:    v_sub_co_u32 v8, s0, v12, s6
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, s0, 0, v0, s0
 ; GFX10-NEXT:    v_add3_u32 v2, v3, v1, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v19, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v20, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s10, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v7, s10, v2
-; GFX10-NEXT:    v_mul_lo_u32 v11, s11, v4
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v4, 0
+; GFX10-NEXT:    v_mul_lo_u32 v7, s2, v2
+; GFX10-NEXT:    v_mul_lo_u32 v11, s3, v4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v17
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
 ; GFX10-NEXT:    v_mov_b32_e32 v16, 0
@@ -2139,33 +2138,33 @@
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s15, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s18, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v9
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v9
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s19, v5
-; GFX10-NEXT:    v_xor_b32_e32 v6, s2, v6
+; GFX10-NEXT:    v_xor_b32_e32 v6, s4, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, vcc_lo, s11, v1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, vcc_lo, s3, v1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v8, s10
+; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v8, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v10, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s18
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s19, v3, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v9
-; GFX10-NEXT:    v_xor_b32_e32 v3, s2, v7
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, s11, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v9
+; GFX10-NEXT:    v_xor_b32_e32 v3, s4, v7
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v13
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v12
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v14, s0, v4, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v2, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v13
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s0
 ; GFX10-NEXT:    v_add_co_u32 v11, s0, v14, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v15, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_sub_co_u32 v7, s0, v12, s10
+; GFX10-NEXT:    v_sub_co_u32 v7, s0, v12, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v14, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
@@ -2177,9 +2176,9 @@
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s0
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[12:13], s[16:17]
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v3, s2
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v3, s4
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v6, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v6, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v6, s1, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v8, s12, v8
 ; GFX10-NEXT:    v_xor_b32_e32 v7, s12, v7
@@ -2187,9 +2186,8 @@
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v8, s12
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7]
+; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[8:9]
+; GFX10-NEXT:    global_store_dwordx4 v16, v[4:7], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i64> %x, %y
   store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -985,8 +985,8 @@
 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: udivrem_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x20
+; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s12
@@ -1255,7 +1255,7 @@
 ;
 ; GFX9-LABEL: udivrem_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x20
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s13
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s12
@@ -1264,7 +1264,7 @@
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1325,6 +1325,7 @@
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
@@ -1510,14 +1511,13 @@
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v13, v20, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s[0:1]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
 ; GFX9-NEXT:    global_store_dwordx4 v0, v[5:8], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x10
+; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x20
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s13
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
@@ -1616,11 +1616,11 @@
 ; GFX10-NEXT:    v_mul_lo_u32 v10, v5, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v11, v4, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v14, v5, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX10-NEXT:    v_mul_lo_u32 v15, v8, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v16, v6, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v17, v8, v3
-; GFX10-NEXT:    v_mul_hi_u32 v3, v6, v3
+; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
 ; GFX10-NEXT:    v_add_co_u32 v10, s0, v12, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, v11, v0
@@ -1642,65 +1642,66 @@
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, v11, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v15, v7
+; GFX10-NEXT:    v_mul_hi_u32 v3, v6, v3
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v5, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v16, v10
 ; GFX10-NEXT:    v_add3_u32 v1, v11, v9, v1
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, v16, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_mul_hi_u32 v5, s8, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v4, s9, v0
+; GFX10-NEXT:    v_mul_hi_u32 v5, s8, v0
 ; GFX10-NEXT:    v_add3_u32 v3, v10, v7, v3
-; GFX10-NEXT:    v_mul_hi_u32 v0, s9, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v7, s8, v1
-; GFX10-NEXT:    v_mul_lo_u32 v10, s9, v1
+; GFX10-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v9, s9, v1
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v6, v3, vcc_lo
 ; GFX10-NEXT:    v_mul_hi_u32 v6, s8, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v10, v0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v9, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v2
+; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, v7, v4
+; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v7, s10, v3
-; GFX10-NEXT:    v_mul_lo_u32 v10, s11, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v8, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v8, s10, v2
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GFX10-NEXT:    v_mul_hi_u32 v11, s10, v3
+; GFX10-NEXT:    v_mul_lo_u32 v9, s11, v3
+; GFX10-NEXT:    v_mul_hi_u32 v10, s10, v3
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v7
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s12, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v12, s13, v4
-; GFX10-NEXT:    v_mul_lo_u32 v13, s12, v5
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v10, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v11, s13, v4
+; GFX10-NEXT:    v_mul_lo_u32 v12, s12, v5
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v9, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v11
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v13, v12
+; GFX10-NEXT:    v_add3_u32 v1, v1, v12, v11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, v7, v6
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v8
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v9, v8
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s9, v1
-; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, s8, v0
+; GFX10-NEXT:    v_sub_co_u32 v9, vcc_lo, s8, v0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v11, s0, s9, v1, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s12, v10
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s12, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v10, s12
+; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v9, s12
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v12, s0, 0, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s13, v11
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v0, vcc_lo
@@ -1747,34 +1748,33 @@
 ; GFX10-NEXT:    v_sub_co_u32 v14, s0, v7, s14
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s2, 0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s15, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s1
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s14, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s1
 ; GFX10-NEXT:    v_add_co_u32 v16, s1, v6, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s1, 0, v3, s1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s15, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s1
-; GFX10-NEXT:    v_add_co_u32 v10, s1, v16, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s1
+; GFX10-NEXT:    v_add_co_u32 v9, s1, v16, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s1, 0, v17, s1
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v8
 ; GFX10-NEXT:    v_sub_co_u32 v8, s1, v14, s14
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v16, v10, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v16, v9, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, v17, v18, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v14, v8, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, v15, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v10, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v9, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v16, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v13, v14, s1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i64> %x, %y
   store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -13,16 +13,17 @@
   ; GFX90A-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, $sgpr17, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT:   renamable $vgpr31 = COPY $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg3.kernarg.offset.align.down, align 8, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
-  ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr20, 0, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit $scc
+  ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr33, 8, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit $scc
+  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc
   ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3)
@@ -32,7 +33,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.1.bb103:
   ; GFX90A-NEXT:   successors: %bb.58(0x40000000), %bb.2(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FC, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
@@ -45,10 +46,10 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58, $sgpr59, $sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58, $sgpr59, $sgpr22, $sgpr20_sgpr21, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr20 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $sgpr23 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr23 = IMPLICIT_DEF
@@ -58,7 +59,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.3.Flow17:
   ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.57(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr20, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr4 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
@@ -66,7 +67,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.4.bb15:
   ; GFX90A-NEXT:   successors: %bb.35(0x40000000), %bb.5(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr25, implicit $exec
@@ -200,7 +201,7 @@
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit $scc
+  ; GFX90A-NEXT:   renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @f2 + 4, target-flags(amdgpu-gotprel32-hi) @f2 + 12, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_LOAD_DWORDX2_IMM killed renamable $sgpr12_sgpr13, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
   ; GFX90A-NEXT:   $sgpr12 = COPY killed renamable $sgpr14
@@ -365,7 +366,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.35.bb20:
   ; GFX90A-NEXT:   successors: %bb.37(0x40000000), %bb.36(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
@@ -412,7 +413,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.37.bb27:
   ; GFX90A-NEXT:   successors: %bb.39(0x40000000), %bb.38(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec
@@ -463,7 +464,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.39.bb34:
   ; GFX90A-NEXT:   successors: %bb.41(0x40000000), %bb.40(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec
@@ -512,7 +513,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.41.bb41:
   ; GFX90A-NEXT:   successors: %bb.46(0x40000000), %bb.42(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
@@ -564,10 +565,10 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.43.bb55:
   ; GFX90A-NEXT:   successors: %bb.48(0x40000000), %bb.44(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr33, 16, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit $scc
+  ; GFX90A-NEXT:   S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
@@ -614,7 +615,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.46.bb48:
   ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.47(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
@@ -665,7 +666,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.48.bb63:
   ; GFX90A-NEXT:   successors: %bb.50(0x40000000), %bb.49(0x40000000)
-  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.50, implicit $vcc
@@ -679,7 +680,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.50.bb68:
   ; GFX90A-NEXT:   successors: %bb.54(0x40000000), %bb.51(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, $vgpr4_vgpr5, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
@@ -707,13 +708,13 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.52.bb80:
   ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.53(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr17 = S_BFE_U32 killed renamable $sgpr33, 65560, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr9, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr1, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.59, implicit $scc
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.59, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.53:
   ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
@@ -736,7 +737,7 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.54.bb73:
   ; GFX90A-NEXT:   successors: %bb.52(0x40000000), %bb.55(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
@@ -774,9 +775,9 @@
   ; GFX90A-NEXT:   renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.3, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.4, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr58, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr59, killed $vgpr5, 1, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr30 = V_ALIGNBIT_B32_e64 $vgpr19, $vgpr18, 1, implicit $exec
@@ -788,9 +789,9 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.57:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr20:0x0000000000000003, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr17 = COPY killed renamable $sgpr20, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr17 = COPY killed renamable $sgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr19 = COPY killed renamable $sgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
@@ -825,21 +826,20 @@
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.58.bb105:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FC, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg3.kernarg.offset.align.down + 16, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.3, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $agpr0_agpr1 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.4, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr20 = S_MOV_B32 0
+  ; GFX90A-NEXT:   renamable $sgpr23 = S_MOV_B32 0
   ; GFX90A-NEXT:   renamable $sgpr17 = S_MOV_B32 0
   ; GFX90A-NEXT:   S_BRANCH %bb.3
   ; GFX90A-NEXT: {{  $}}
@@ -986,13 +986,13 @@
   ; GFX90A-NEXT:   renamable $vgpr35 = COPY renamable $vgpr29, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.3, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.4, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.3, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.3, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr29, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -6,24 +6,26 @@
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_mov_b64 s[26:27], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[24:25], s[0:1]
+; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x4
 ; CHECK-NEXT:    s_add_u32 s24, s24, s7
 ; CHECK-NEXT:    s_addc_u32 s25, s25, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
-; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT:    s_bitcmp1_b32 s0, 8
+; CHECK-NEXT:    s_bitcmp1_b32 s2, 0
+; CHECK-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; CHECK-NEXT:    s_bitcmp1_b32 s2, 8
 ; CHECK-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; CHECK-NEXT:    s_bitcmp1_b32 s0, 16
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT:    s_bitcmp1_b32 s2, 16
 ; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 24
 ; CHECK-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; CHECK-NEXT:    s_xor_b64 s[4:5], s[8:9], -1
 ; CHECK-NEXT:    s_bitcmp1_b32 s1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
 ; CHECK-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; CHECK-NEXT:    s_bitcmp1_b32 s1, 8
+; CHECK-NEXT:    s_bitcmp1_b32 s6, 8
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[16:17]
 ; CHECK-NEXT:    s_cselect_b64 s[14:15], -1, 0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v0
 ; CHECK-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -25,79 +25,70 @@
 ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
-; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-UNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_load_2xi16_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2
+; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2:
 ; GFX9-FLASTSCR:       ; %bb.0:
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLASTSCR-NEXT:    v_add_u32_e32 v1, 2, v0
-; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v2, v0, off
-; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v3, v1, off
+; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
+; GFX9-FLASTSCR-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; GFX9-FLASTSCR-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-FLASTSCR-NEXT:    v_and_or_b32 v0, v0, s0, v1
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_load_2xi16_align2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_ushort v1, v0, s[0:3], 0 offen
-; GFX10-NEXT:    buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2
+; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align2:
 ; GFX10-FLASTSCR:       ; %bb.0:
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-FLASTSCR-NEXT:    v_add_nc_u32_e32 v1, 2, v0
-; GFX10-FLASTSCR-NEXT:    s_clause 0x1
-; GFX10-FLASTSCR-NEXT:    scratch_load_ushort v2, v0, off
-; GFX10-FLASTSCR-NEXT:    scratch_load_ushort v3, v1, off
+; GFX10-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; GFX10-FLASTSCR-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_load_2xi16_align2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v0, v0, off
-; GFX11-NEXT:    scratch_load_u16 v1, v1, off
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2:
 ; GFX11-FLASTSCR:       ; %bb.0:
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLASTSCR-NEXT:    v_add_nc_u32_e32 v1, 2, v0
-; GFX11-FLASTSCR-NEXT:    s_clause 0x1
-; GFX11-FLASTSCR-NEXT:    scratch_load_u16 v0, v0, off
-; GFX11-FLASTSCR-NEXT:    scratch_load_u16 v1, v1, off
+; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FLASTSCR-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-FLASTSCR-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 2
@@ -125,32 +116,24 @@
 ; GFX7-UNALIGNED-LABEL: private_store_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v3, 1
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 2
-; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
-; GFX7-UNALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], 0 offen
-; GFX7-UNALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
+; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_store_2xi16_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX9-NEXT:    v_mov_b32_e32 v0, 2
-; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen offset:2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLASTSCR-LABEL: private_store_2xi16_align2:
 ; GFX9-FLASTSCR:       ; %bb.0:
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v2, 1
-; GFX9-FLASTSCR-NEXT:    v_add_u32_e32 v0, 2, v1
-; GFX9-FLASTSCR-NEXT:    scratch_store_short v1, v2, off
-; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v1, 2
-; GFX9-FLASTSCR-NEXT:    scratch_store_short v0, v1, off
+; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX9-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -158,10 +141,8 @@
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, 2
-; GFX10-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX10-NEXT:    buffer_store_short v2, v1, s[0:3], 0 offen offset:2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -169,11 +150,8 @@
 ; GFX10-FLASTSCR:       ; %bb.0:
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 1
-; GFX10-FLASTSCR-NEXT:    v_add_nc_u32_e32 v2, 2, v1
-; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v3, 2
-; GFX10-FLASTSCR-NEXT:    scratch_store_short v1, v0, off
-; GFX10-FLASTSCR-NEXT:    scratch_store_short v2, v3, off
+; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX10-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -181,11 +159,8 @@
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 2, v1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b16 v1, v0, off
-; GFX11-NEXT:    scratch_store_b16 v2, v3, off
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX11-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -193,11 +168,8 @@
 ; GFX11-FLASTSCR:       ; %bb.0:
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLASTSCR-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2
-; GFX11-FLASTSCR-NEXT:    v_add_nc_u32_e32 v2, 2, v1
-; GFX11-FLASTSCR-NEXT:    s_clause 0x1
-; GFX11-FLASTSCR-NEXT:    scratch_store_b16 v1, v0, off
-; GFX11-FLASTSCR-NEXT:    scratch_store_b16 v2, v3, off
+; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3220,19 +3220,18 @@
 define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) {
 ; SI-LABEL: s_fneg_select_infloop_regression_v2f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    v_bfrev_b32_e32 v0, 1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bitcmp1_b32 s4, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v2, -v1, v0, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
-; SI-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[4:5]
+; SI-NEXT:    s_bitcmp1_b32 s6, 0
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v2, -v1, v0, s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -3240,19 +3239,18 @@
 ;
 ; VI-LABEL: s_fneg_select_infloop_regression_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; VI-NEXT:    v_bfrev_b32_e32 v0, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s4, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v2, -v1, v0, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
-; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
-; VI-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[4:5]
+; VI-NEXT:    s_bitcmp1_b32 s6, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v2, -v1, v0, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[2:3]
+; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -110,13 +110,13 @@
 ; GCN-LABEL: float8_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GCN-NEXT:    s_load_dword s1, s[0:1], 0x64
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x64
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_add_u32 s0, s2, 16
-; GCN-NEXT:    s_mov_b32 m0, s1
-; GCN-NEXT:    s_addc_u32 s1, s3, 0
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_add_u32 s2, s0, 16
+; GCN-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NEXT:    v_mov_b32_e32 v3, s7
@@ -124,13 +124,13 @@
 ; GCN-NEXT:    v_mov_b32_e32 v5, s9
 ; GCN-NEXT:    v_mov_b32_e32 v6, s10
 ; GCN-NEXT:    v_mov_b32_e32 v7, s11
-; GCN-NEXT:    v_mov_b32_e32 v9, s1
+; GCN-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
-; GCN-NEXT:    v_mov_b32_e32 v8, s0
+; GCN-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -497,42 +497,38 @@
 define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; SI-NEXT:    s_load_dword s8, s[4:5], 0x4
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    s_cmp_lg_u32 s8, 1
+; SI-NEXT:    s_cmp_lg_u32 s2, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_lg_u32 s8, 0
+; SI-NEXT:    s_cmp_lg_u32 s2, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    s_load_dword s8, s[4:5], 0x10
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_cmp_lg_u32 s8, 1
+; VI-NEXT:    s_cmp_lg_u32 s2, 1
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_cmp_lg_u32 s8, 0
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -658,8 +654,8 @@
 define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v8f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; SI-NEXT:    v_mov_b32_e32 v8, 0x40a00000
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
@@ -681,8 +677,8 @@
 ;
 ; VI-LABEL: dynamic_insertelement_v8f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
 ; VI-NEXT:    v_mov_b32_e32 v8, 0x40a00000
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
@@ -1022,37 +1018,33 @@
 define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; SI-NEXT:    s_load_dword s8, s[4:5], 0x4
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_cmp_lg_u32 s8, 1
-; SI-NEXT:    s_cselect_b32 s0, s3, 5
-; SI-NEXT:    s_cmp_lg_u32 s8, 0
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_cselect_b32 s1, s2, 5
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cmp_lg_u32 s2, 1
+; SI-NEXT:    s_cselect_b32 s1, s1, 5
+; SI-NEXT:    s_cmp_lg_u32 s2, 0
+; SI-NEXT:    s_cselect_b32 s0, s0, 5
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    s_load_dword s8, s[4:5], 0x10
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_cmp_lg_u32 s8, 1
-; VI-NEXT:    s_cselect_b32 s0, s3, 5
-; VI-NEXT:    s_cmp_lg_u32 s8, 0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_cselect_b32 s1, s2, 5
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_cmp_lg_u32 s2, 1
+; VI-NEXT:    s_cselect_b32 s1, s1, 5
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_cselect_b32 s0, s0, 5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
@@ -1162,8 +1154,8 @@
 ; SI-LABEL: dynamic_insertelement_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
-; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1175,7 +1167,7 @@
 ; SI-NEXT:    v_mov_b32_e32 v5, s13
 ; SI-NEXT:    v_mov_b32_e32 v6, s14
 ; SI-NEXT:    v_mov_b32_e32 v7, s15
-; SI-NEXT:    s_mov_b32 m0, s6
+; SI-NEXT:    s_mov_b32 m0, s4
 ; SI-NEXT:    v_movreld_b32_e32 v0, 5
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -1184,8 +1176,8 @@
 ; VI-LABEL: dynamic_insertelement_v8i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1197,7 +1189,7 @@
 ; VI-NEXT:    v_mov_b32_e32 v5, s13
 ; VI-NEXT:    v_mov_b32_e32 v6, s14
 ; VI-NEXT:    v_mov_b32_e32 v7, s15
-; VI-NEXT:    s_mov_b32 m0, s6
+; VI-NEXT:    s_mov_b32 m0, s4
 ; VI-NEXT:    v_movreld_b32_e32 v0, 5
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -279,9 +279,10 @@
 ; Make sure we don't think the alignment will increase if the base address isn't an alloca
 define void @private_store_2xi16_align2_not_alloca(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
 ; CHECK-LABEL: @private_store_2xi16_align2_not_alloca(
-; CHECK-NEXT:    [[GEP_R:%.*]] = getelementptr i16, ptr addrspace(5) [[R:%.*]], i32 1
-; CHECK-NEXT:    store i16 1, ptr addrspace(5) [[R]], align 2
-; CHECK-NEXT:    store i16 2, ptr addrspace(5) [[GEP_R]], align 2
+; ALIGNED-NEXT:  [[GEP_R:%.*]] = getelementptr i16, ptr addrspace(5) [[R:%.*]], i32 1
+; ALIGNED-NEXT:  store i16 1, ptr addrspace(5) [[R]], align 2
+; ALIGNED-NEXT:  store i16 2, ptr addrspace(5) [[GEP_R]], align 2
+; UNALIGNED-NEXT:store <2 x i16>
 ; CHECK-NEXT:    ret void
 ;
   %gep.r = getelementptr i16, ptr addrspace(5) %r, i32 1
@@ -309,11 +310,12 @@
 
 define i32 @private_load_2xi16_align2_not_alloca(ptr addrspace(5) %p) #0 {
 ; CHECK-LABEL: @private_load_2xi16_align2_not_alloca(
-; CHECK-NEXT:    [[GEP_P:%.*]] = getelementptr i16, ptr addrspace(5) [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P_0:%.*]] = load i16, ptr addrspace(5) [[P]], align 2
-; CHECK-NEXT:    [[P_1:%.*]] = load i16, ptr addrspace(5) [[GEP_P]], align 2
-; CHECK-NEXT:    [[ZEXT_0:%.*]] = zext i16 [[P_0]] to i32
-; CHECK-NEXT:    [[ZEXT_1:%.*]] = zext i16 [[P_1]] to i32
+; ALIGNED-NEXT:  [[GEP_P:%.*]] = getelementptr i16, ptr addrspace(5) [[P:%.*]], i64 1
+; ALIGNED-NEXT:  [[P_0:%.*]] = load i16, ptr addrspace(5) [[P]], align 2
+; ALIGNED-NEXT:  [[P_1:%.*]] = load i16, ptr addrspace(5) [[GEP_P]], align 2
+; UNALIGNED-NEXT:load <2 x i16>
+; CHECK:         [[ZEXT_0:%.*]] = zext i16
+; CHECK-NEXT:    [[ZEXT_1:%.*]] = zext i16
 ; CHECK-NEXT:    [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
 ; CHECK-NEXT:    ret i32 [[OR]]
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -85,21 +85,14 @@
   ret float %x
 }
 
-; Here we have four stores, with an aliasing load before the last one.  We
-; could vectorize two of the stores before the load (although we currently
-; don't), but the important thing is that we *don't* sink the store to
-; a[idx + 1] below the load.
+; Here we have four stores, with an aliasing load before the last one.  We can
+; vectorize three of the stores before the load, but the important thing is that
+; we *don't* sink the store to a[idx + 1] below the load.
 ;
 ; CHECK-LABEL: @insert_store_point_alias_ooo
-; CHECK: store float
-; CHECK-SAME: %a.idx.3
-; CHECK: store float
-; CHECK-SAME: %a.idx.1
-; CHECK: store float
-; CHECK-SAME: %a.idx.2
+; CHECK: store <3 x float>{{.*}} %a.idx.1
 ; CHECK: load float, ptr addrspace(1) %a.idx.2
-; CHECK: store float
-; CHECK-SAME: %a.idx
+; CHECK: store float{{.*}} %a.idx
 define float @insert_store_point_alias_ooo(ptr addrspace(1) nocapture %a, i64 %idx) {
   %a.idx = getelementptr inbounds float, ptr addrspace(1) %a, i64 %idx
   %a.idx.1 = getelementptr inbounds float, ptr addrspace(1) %a.idx, i64 1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -57,10 +57,17 @@
 }
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
-; ALL: store i32
-; ALL: store i32
-; ALL: store i32
-; ALL: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT16-UNALIGNED: store <4 x i32>
 define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(ptr addrspace(5) %out) #0 {
   %out.gep.1 = getelementptr i32, ptr addrspace(5) %out, i32 1
   %out.gep.2 = getelementptr i32, ptr addrspace(5) %out, i32 2
@@ -117,8 +124,9 @@
 }
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
-; ALL: store i16
-; ALL: store i16
+; ALIGNED: store i16
+; ALIGNED: store i16
+; UNALIGNED: store <2 x i16>
 define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(ptr addrspace(5) %out) #0 {
   %out.gep.1 = getelementptr i16, ptr addrspace(5) %out, i32 1
 
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
@@ -26,25 +26,17 @@
   ret void
 }
 
-; Check adjiacent memory locations are properly matched and the
+; Check adjacent memory locations are properly matched and the
 ; longest chain vectorized
 
 ; GCN-LABEL: @interleave_get_longest
 
-; GFX7: load <2 x i32>
-; GFX7: load i32
-; GFX7: store <2 x i32> zeroinitializer
-; GFX7: load i32
-; GFX7: load <2 x i32>
-; GFX7: load i32
-; GFX7: load i32
-
-; GFX9: load <4 x i32>
-; GFX9: load i32
-; GFX9: store <2 x i32> zeroinitializer
-; GFX9: load i32
-; GFX9: load i32
-; GFX9: load i32
+; GCN: load <2 x i32>{{.*}} %tmp1
+; GCN: store <2 x i32> zeroinitializer{{.*}} %tmp1
+; GCN: load <2 x i32>{{.*}} %tmp2
+; GCN: load <2 x i32>{{.*}} %tmp4
+; GCN: load i32{{.*}} %tmp5
+; GCN: load i32{{.*}} %tmp5
 
 define amdgpu_kernel void @interleave_get_longest(i32 %arg) {
   %a1 = add i32 %arg, 1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
@@ -42,6 +42,54 @@
   ret void
 }
 
+; CHECK-LABEL: @merge_ptr_i32(
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+define amdgpu_kernel void @merge_ptr_i32(ptr addrspace(3) nocapture %a, ptr addrspace(3) nocapture readonly %b) #0 {
+entry: ; Mixed-type accesses (i32, pointer, <2 x i32>) at slots 0, 1, 2 of %a (stores) and %b (loads).
+  %a.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 0
+  %a.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 1
+  %a.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 2
+  ; Same slot indices on the load side; each GEP steps in units of ptr addrspace(3).
+  %b.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 0
+  %b.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 1
+  %b.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 2
+  ; Scalar first, vector last. NOTE(review): presumably 16 contiguous bytes if ptr addrspace(3) is 32-bit; confirm against the datalayout.
+  %ld.0 = load i32, ptr addrspace(3) %b.0, align 16
+  %ld.1 = load ptr addrspace(3), ptr addrspace(3) %b.1, align 4
+  %ld.2 = load <2 x i32>, ptr addrspace(3) %b.2, align 8
+  ; Stores mirror the loads: same three types, same slots, same alignments.
+  store i32 0, ptr addrspace(3) %a.0, align 16
+  store ptr addrspace(3) null, ptr addrspace(3) %a.1, align 4
+  store <2 x i32> <i32 0, i32 0>, ptr addrspace(3) %a.2, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_ptr_i32_vec_first(
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+define amdgpu_kernel void @merge_ptr_i32_vec_first(ptr addrspace(3) nocapture %a, ptr addrspace(3) nocapture readonly %b) #0 {
+entry: ; Mixed-type accesses with the <2 x i32> first: slots 0, 2, 3 of %a (stores) and %b (loads).
+  %a.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 0
+  %a.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 2
+  %a.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 3
+  ; Same slot indices on the load side; index 1 is skipped. NOTE(review): presumably covered by the vector at slot 0; confirm.
+  %b.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 0
+  %b.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 2
+  %b.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 3
+  ; Vector first, then pointer, then scalar i32, with alignments 16, 8, 4.
+  %ld.0 = load <2 x i32>, ptr addrspace(3) %b.0, align 16
+  %ld.1 = load ptr addrspace(3), ptr addrspace(3) %b.1, align 8
+  %ld.2 = load i32, ptr addrspace(3) %b.2, align 4
+  ; Stores mirror the loads: same three types, same slots, same alignments.
+  store <2 x i32> <i32 0, i32 0>, ptr addrspace(3) %a.0, align 16
+  store ptr addrspace(3) null, ptr addrspace(3) %a.1, align 8
+  store i32 0, ptr addrspace(3) %a.2, align 4
+
+  ret void
+}
+
 ; CHECK-LABEL: @merge_load_i64_ptr64(
 ; CHECK: load <2 x i64>
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll
@@ -82,7 +82,7 @@
   %a.ascast = addrspacecast ptr addrspace(5) %p to ptr
   %b.ascast = addrspacecast ptr addrspace(5) %gep2 to ptr
   %tmp1 = load i8, ptr %a.ascast, align 1
-  %tmp2 = load i8, ptr %b.ascast, align 1
+  %tmp2 = load i8, ptr %b.ascast, align 2
   unreachable
 }
 
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll
@@ -1,10 +1,10 @@
 ; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
 
 define void @ldg_f16(ptr nocapture align 16 %rd0) {
-  %load1 = load <2 x half>, ptr %rd0, align 4
+  %load1 = load <2 x half>, ptr %rd0, align 16
   %p1 = fcmp ogt <2 x half> %load1, zeroinitializer
   %s1 = select <2 x i1> %p1, <2 x half> %load1, <2 x half> zeroinitializer
-  store <2 x half> %s1, ptr %rd0, align 4
+  store <2 x half> %s1, ptr %rd0, align 16
   %in2 = getelementptr half, ptr %rd0, i64 2
   %load2 = load <2 x half>, ptr %in2, align 4
   %p2 = fcmp ogt <2 x half> %load2, zeroinitializer
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll
@@ -0,0 +1,1136 @@
+; This is an end-to-end test that checks that the load-store vectorizer (LSV)
+; succeeds at vectorizing a large program with many loads.
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s > %t
+; RUN: grep 'load i8' < %t | count 18
+; RUN: grep 'load <2 x i8>' < %t | count 9
+; RUN: grep 'load <4 x i8>' < %t | count 27
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+declare float @llvm.ceil.f32(float) #0
+declare i32 @llvm.smax.i32(i32, i32) #0
+declare i32 @llvm.umin.i32(i32, i32) #0
+
+define void @many_loads(ptr noalias readonly align 128 dereferenceable(5111808) %arg0, ptr noalias nocapture readonly align 128 dereferenceable(29952) %arg1, ptr noalias nocapture readonly align 128 dereferenceable(2664) %arg2, ptr noalias nocapture readonly align 128 dereferenceable(888) %arg3, ptr noalias nocapture writeonly align 128 dereferenceable(17731584) %arg4) local_unnamed_addr #1 {
+entry:
+  %arg41104 = addrspacecast ptr %arg4 to ptr addrspace(1)
+  %arg31102 = addrspacecast ptr %arg3 to ptr addrspace(1)
+  %arg21100 = addrspacecast ptr %arg2 to ptr addrspace(1)
+  %arg11098 = addrspacecast ptr %arg1 to ptr addrspace(1)
+  %arg01096 = addrspacecast ptr %arg0 to ptr addrspace(1)
+  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !140
+  %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !141
+  %2 = shl nuw nsw i32 %0, 6
+  %linear_index = or i32 %2, %1
+  %linear_index_base = shl nuw nsw i32 %linear_index, 4
+  %.urem = add nsw i32 %linear_index, -554112
+  %.cmp = icmp ult i32 %linear_index, 554112
+  %3 = select i1 %.cmp, i32 %linear_index, i32 %.urem
+  %4 = urem i32 %linear_index, 2496
+  %.lhs.trunc = trunc i32 %0 to i16
+  %5 = udiv i16 %.lhs.trunc, 39
+  %.zext = zext i16 %5 to i32
+  %6 = mul nuw nsw i32 %.zext, 2496
+  %7 = add nuw nsw i32 %6, %4
+  %8 = udiv i32 %7, 222
+  %9 = mul i32 %8, 222
+  %.decomposed = sub i32 %7, %9
+  %10 = mul nuw nsw i32 %8, 3
+  %11 = mul nuw nsw i32 %.decomposed, 3
+  %12 = uitofp i32 %8 to float
+  %add.26 = fadd float %12, -1.000000e+00
+  %13 = tail call float @llvm.ceil.f32(float %add.26)
+  %14 = fcmp ole float %13, 0.000000e+00
+  %15 = select i1 %14, float 0.000000e+00, float %13
+  %16 = fcmp oge float %15, 2.493000e+03
+  %17 = select i1 %16, float 2.493000e+03, float %15
+  %.inv = fcmp ole float %17, 0xC1E0000000000000
+  %18 = select i1 %.inv, float 0xC1E0000000000000, float %17
+  %19 = fptosi float %18 to i32
+  %20 = fcmp oge float %17, 0x41E0000000000000
+  %21 = tail call i32 @llvm.smax.i32(i32 %19, i32 0)
+  %22 = tail call i32 @llvm.umin.i32(i32 %21, i32 2493)
+  %23 = select i1 %20, i32 2493, i32 %22
+  %24 = uitofp i32 %.decomposed to float
+  %add.3613 = fadd float %24, 5.000000e-01
+  %multiply.3915 = fmul float %add.3613, 0x3FE27350C0000000
+  %add.4217 = fadd float %multiply.3915, -1.500000e+00
+  %25 = tail call float @llvm.ceil.f32(float %add.4217)
+  %26 = fcmp ole float %25, 0.000000e+00
+  %27 = select i1 %26, float 0.000000e+00, float %25
+  %28 = fcmp oge float %27, 1.250000e+02
+  %29 = select i1 %28, float 1.250000e+02, float %27
+  %.inv821 = fcmp ole float %29, 0xC1E0000000000000
+  %30 = select i1 %.inv821, float 0xC1E0000000000000, float %29
+  %31 = fptosi float %30 to i32
+  %32 = fcmp oge float %29, 0x41E0000000000000
+  %33 = tail call i32 @llvm.smax.i32(i32 %31, i32 0)
+  %34 = fcmp uno float %29, 0.000000e+00
+  %35 = tail call i32 @llvm.umin.i32(i32 %33, i32 125)
+  %36 = select i1 %32, i32 125, i32 %35
+  %37 = select i1 %34, i32 0, i32 %36
+  %.lhs.trunc1053 = trunc i32 %11 to i16
+  %38 = udiv i16 %.lhs.trunc1053, 3
+  %39 = mul i16 %38, 3
+  %.decomposed1089 = sub i16 %.lhs.trunc1053, %39
+  %40 = zext i16 %38 to i64
+  %41 = zext i16 %.decomposed1089 to i64
+  %42 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %40, i64 %41
+  %43 = load float, ptr addrspace(1) %42, align 4, !invariant.load !142
+  %44 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %40
+  %45 = load float, ptr addrspace(1) %44, align 4, !invariant.load !142
+  %divide.6 = fdiv float %43, %45
+  %46 = zext i32 %10 to i64
+  %47 = getelementptr inbounds [7488 x float], ptr addrspace(1) %arg11098, i64 0, i64 %46
+  %48 = load float, ptr addrspace(1) %47, align 4, !invariant.load !142
+  %multiply.10 = fmul float %divide.6, %48
+  %49 = zext i32 %23 to i64
+  %50 = zext i32 %37 to i64
+  %51 = getelementptr inbounds [1 x [4 x [2496 x [128 x [4 x i8]]]]], ptr addrspace(1) %arg01096, i64 0, i64 0, i64 0, i64 %49, i64 %50, i64 0
+  %52 = load i8, ptr addrspace(1) %51, align 4, !invariant.load !142
+  %53 = sitofp i8 %52 to float
+  %multiply.18 = fmul float %53, 0x3FC3BF2820000000
+  %multiply.53 = fmul float %multiply.10, %multiply.18
+  %add.57.i = fadd float %multiply.53, 0.000000e+00
+  %.lhs.trunc1053.1 = add nuw nsw i16 %.lhs.trunc1053, 1
+  %54 = udiv i16 %.lhs.trunc1053.1, 3
+  %55 = mul i16 %54, 3
+  %.decomposed1090 = sub i16 %.lhs.trunc1053.1, %55
+  %56 = zext i16 %54 to i64
+  %57 = zext i16 %.decomposed1090 to i64
+  %58 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %56, i64 %57
+  %59 = load float, ptr addrspace(1) %58, align 4, !invariant.load !142
+  %60 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %56
+  %61 = load float, ptr addrspace(1) %60, align 4, !invariant.load !142
+  %divide.6.1 = fdiv float %59, %61
+  %multiply.10.1 = fmul float %divide.6.1, %48
+  %62 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 4
+  %63 = load i8, ptr addrspace(1) %62, align 4, !invariant.load !142
+  %64 = sitofp i8 %63 to float
+  %multiply.18.1 = fmul float %64, 0x3FC3BF2820000000
+  %multiply.53.1 = fmul float %multiply.10.1, %multiply.18.1
+  %add.57.i.1 = fadd float %add.57.i, %multiply.53.1
+  %.lhs.trunc1053.2 = add nuw nsw i16 %.lhs.trunc1053, 2
+  %65 = udiv i16 %.lhs.trunc1053.2, 3
+  %66 = mul i16 %65, 3
+  %.decomposed1091 = sub i16 %.lhs.trunc1053.2, %66
+  %67 = zext i16 %65 to i64
+  %68 = zext i16 %.decomposed1091 to i64
+  %69 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %67, i64 %68
+  %70 = load float, ptr addrspace(1) %69, align 4, !invariant.load !142
+  %71 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %67
+  %72 = load float, ptr addrspace(1) %71, align 4, !invariant.load !142
+  %divide.6.2 = fdiv float %70, %72
+  %multiply.10.2 = fmul float %divide.6.2, %48
+  %73 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 8
+  %74 = load i8, ptr addrspace(1) %73, align 4, !invariant.load !142
+  %75 = sitofp i8 %74 to float
+  %multiply.18.2 = fmul float %75, 0x3FC3BF2820000000
+  %multiply.53.2 = fmul float %multiply.10.2, %multiply.18.2
+  %add.57.i.2 = fadd float %add.57.i.1, %multiply.53.2
+  %76 = getelementptr inbounds float, ptr addrspace(1) %47, i64 1
+  %77 = load float, ptr addrspace(1) %76, align 4, !invariant.load !142
+  %multiply.10.3 = fmul float %divide.6, %77
+  %78 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 512
+  %79 = load i8, ptr addrspace(1) %78, align 4, !invariant.load !142
+  %80 = sitofp i8 %79 to float
+  %multiply.18.3 = fmul float %80, 0x3FC3BF2820000000
+  %multiply.53.3 = fmul float %multiply.10.3, %multiply.18.3
+  %add.57.i.3 = fadd float %add.57.i.2, %multiply.53.3
+  %multiply.10.4 = fmul float %divide.6.1, %77
+  %81 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 516
+  %82 = load i8, ptr addrspace(1) %81, align 4, !invariant.load !142
+  %83 = sitofp i8 %82 to float
+  %multiply.18.4 = fmul float %83, 0x3FC3BF2820000000
+  %multiply.53.4 = fmul float %multiply.10.4, %multiply.18.4
+  %add.57.i.4 = fadd float %add.57.i.3, %multiply.53.4
+  %multiply.10.5 = fmul float %divide.6.2, %77
+  %84 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 520
+  %85 = load i8, ptr addrspace(1) %84, align 4, !invariant.load !142
+  %86 = sitofp i8 %85 to float
+  %multiply.18.5 = fmul float %86, 0x3FC3BF2820000000
+  %multiply.53.5 = fmul float %multiply.10.5, %multiply.18.5
+  %add.57.i.5 = fadd float %add.57.i.4, %multiply.53.5
+  %87 = getelementptr inbounds float, ptr addrspace(1) %47, i64 2
+  %88 = load float, ptr addrspace(1) %87, align 4, !invariant.load !142
+  %multiply.10.6 = fmul float %divide.6, %88
+  %89 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 1024
+  %90 = load i8, ptr addrspace(1) %89, align 4, !invariant.load !142
+  %91 = sitofp i8 %90 to float
+  %multiply.18.6 = fmul float %91, 0x3FC3BF2820000000
+  %multiply.53.6 = fmul float %multiply.10.6, %multiply.18.6
+  %add.57.i.6 = fadd float %add.57.i.5, %multiply.53.6
+  %multiply.10.7 = fmul float %divide.6.1, %88
+  %92 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 1028
+  %93 = load i8, ptr addrspace(1) %92, align 4, !invariant.load !142
+  %94 = sitofp i8 %93 to float
+  %multiply.18.7 = fmul float %94, 0x3FC3BF2820000000
+  %multiply.53.7 = fmul float %multiply.10.7, %multiply.18.7
+  %add.57.i.7 = fadd float %add.57.i.6, %multiply.53.7
+  %multiply.10.8 = fmul float %divide.6.2, %88
+  %95 = getelementptr inbounds i8, ptr addrspace(1) %51, i64 1032
+  %96 = load i8, ptr addrspace(1) %95, align 4, !invariant.load !142
+  %97 = sitofp i8 %96 to float
+  %multiply.18.8 = fmul float %97, 0x3FC3BF2820000000
+  %multiply.53.8 = fmul float %multiply.10.8, %multiply.18.8
+  %add.57.i.8 = fadd float %add.57.i.7, %multiply.53.8
+  %98 = fptrunc float %add.57.i.8 to half
+  %99 = zext i32 %linear_index_base to i64
+  %100 = getelementptr half, ptr addrspace(1) %arg41104, i64 %99
+  store half %98, ptr addrspace(1) %100, align 32
+  %101 = udiv i32 %3, 222
+  %102 = mul i32 %101, 222
+  %.decomposed1092 = sub i32 %3, %102
+  %103 = mul nuw nsw i32 %101, 3
+  %104 = mul nuw nsw i32 %.decomposed1092, 3
+  %105 = uitofp i32 %101 to float
+  %add.2637 = fadd float %105, -1.000000e+00
+  %106 = tail call float @llvm.ceil.f32(float %add.2637)
+  %107 = fcmp ole float %106, 0.000000e+00
+  %108 = select i1 %107, float 0.000000e+00, float %106
+  %109 = fcmp oge float %108, 2.493000e+03
+  %110 = select i1 %109, float 2.493000e+03, float %108
+  %.inv824 = fcmp ole float %110, 0xC1E0000000000000
+  %111 = select i1 %.inv824, float 0xC1E0000000000000, float %110
+  %112 = fptosi float %111 to i32
+  %113 = fcmp oge float %110, 0x41E0000000000000
+  %114 = tail call i32 @llvm.smax.i32(i32 %112, i32 0)
+  %115 = tail call i32 @llvm.umin.i32(i32 %114, i32 2493)
+  %116 = select i1 %113, i32 2493, i32 %115
+  %117 = uitofp i32 %.decomposed1092 to float
+  %add.3660 = fadd float %117, 5.000000e-01
+  %multiply.3962 = fmul float %add.3660, 0x3FE27350C0000000
+  %add.4264 = fadd float %multiply.3962, -1.500000e+00
+  %118 = tail call float @llvm.ceil.f32(float %add.4264)
+  %119 = fcmp ole float %118, 0.000000e+00
+  %120 = select i1 %119, float 0.000000e+00, float %118
+  %121 = fcmp oge float %120, 1.250000e+02
+  %122 = select i1 %121, float 1.250000e+02, float %120
+  %.inv827 = fcmp ole float %122, 0xC1E0000000000000
+  %123 = select i1 %.inv827, float 0xC1E0000000000000, float %122
+  %124 = fptosi float %123 to i32
+  %125 = fcmp oge float %122, 0x41E0000000000000
+  %126 = tail call i32 @llvm.smax.i32(i32 %124, i32 0)
+  %127 = fcmp uno float %122, 0.000000e+00
+  %128 = tail call i32 @llvm.umin.i32(i32 %126, i32 125)
+  %129 = select i1 %125, i32 125, i32 %128
+  %130 = select i1 %127, i32 0, i32 %129
+  %.lhs.trunc1045 = trunc i32 %104 to i16
+  %131 = udiv i16 %.lhs.trunc1045, 3
+  %132 = mul i16 %131, 3
+  %.decomposed1093 = sub i16 %.lhs.trunc1045, %132
+  %133 = zext i16 %131 to i64
+  %134 = zext i16 %.decomposed1093 to i64
+  %135 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %133, i64 %134
+  %136 = load float, ptr addrspace(1) %135, align 4, !invariant.load !142
+  %137 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %133
+  %138 = load float, ptr addrspace(1) %137, align 4, !invariant.load !142
+  %divide.631 = fdiv float %136, %138
+  %139 = zext i32 %103 to i64
+  %140 = getelementptr inbounds [7488 x float], ptr addrspace(1) %arg11098, i64 0, i64 %139
+  %141 = load float, ptr addrspace(1) %140, align 4, !invariant.load !142
+  %multiply.1032 = fmul float %divide.631, %141
+  %142 = zext i32 %116 to i64
+  %143 = zext i32 %130 to i64
+  %144 = getelementptr [1 x [4 x [2496 x [128 x [4 x i8]]]]], ptr addrspace(1) %arg01096, i64 0, i64 0, i64 0, i64 %142, i64 %143, i64 0
+  %145 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1
+  %146 = load i8, ptr addrspace(1) %145, align 1, !invariant.load !142
+  %147 = sitofp i8 %146 to float
+  %multiply.1870 = fmul float %147, 0x3FC3BF2820000000
+  %multiply.5371 = fmul float %multiply.1032, %multiply.1870
+  %add.57.i914 = fadd float %multiply.5371, 0.000000e+00
+  %.lhs.trunc1045.1 = add nuw nsw i16 %.lhs.trunc1045, 1
+  %148 = udiv i16 %.lhs.trunc1045.1, 3
+  %149 = mul i16 %148, 3
+  %.decomposed1094 = sub i16 %.lhs.trunc1045.1, %149
+  %150 = zext i16 %148 to i64
+  %151 = zext i16 %.decomposed1094 to i64
+  %152 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %150, i64 %151
+  %153 = load float, ptr addrspace(1) %152, align 4, !invariant.load !142
+  %154 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %150
+  %155 = load float, ptr addrspace(1) %154, align 4, !invariant.load !142
+  %divide.631.1 = fdiv float %153, %155
+  %multiply.1032.1 = fmul float %divide.631.1, %141
+  %156 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 5
+  %157 = load i8, ptr addrspace(1) %156, align 1, !invariant.load !142
+  %158 = sitofp i8 %157 to float
+  %multiply.1870.1 = fmul float %158, 0x3FC3BF2820000000
+  %multiply.5371.1 = fmul float %multiply.1032.1, %multiply.1870.1
+  %add.57.i914.1 = fadd float %add.57.i914, %multiply.5371.1
+  %.lhs.trunc1045.2 = add nuw nsw i16 %.lhs.trunc1045, 2
+  %159 = udiv i16 %.lhs.trunc1045.2, 3
+  %160 = mul i16 %159, 3
+  %.decomposed1095 = sub i16 %.lhs.trunc1045.2, %160
+  %161 = zext i16 %159 to i64
+  %162 = zext i16 %.decomposed1095 to i64
+  %163 = getelementptr inbounds [222 x [3 x float]], ptr addrspace(1) %arg21100, i64 0, i64 %161, i64 %162
+  %164 = load float, ptr addrspace(1) %163, align 4, !invariant.load !142
+  %165 = getelementptr inbounds [222 x float], ptr addrspace(1) %arg31102, i64 0, i64 %161
+  %166 = load float, ptr addrspace(1) %165, align 4, !invariant.load !142
+  %divide.631.2 = fdiv float %164, %166
+  %multiply.1032.2 = fmul float %divide.631.2, %141
+  %167 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 9
+  %168 = load i8, ptr addrspace(1) %167, align 1, !invariant.load !142
+  %169 = sitofp i8 %168 to float
+  %multiply.1870.2 = fmul float %169, 0x3FC3BF2820000000
+  %multiply.5371.2 = fmul float %multiply.1032.2, %multiply.1870.2
+  %add.57.i914.2 = fadd float %add.57.i914.1, %multiply.5371.2
+  %170 = getelementptr inbounds float, ptr addrspace(1) %140, i64 1
+  %171 = load float, ptr addrspace(1) %170, align 4, !invariant.load !142
+  %multiply.1032.3 = fmul float %divide.631, %171
+  %172 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 513
+  %173 = load i8, ptr addrspace(1) %172, align 1, !invariant.load !142
+  %174 = sitofp i8 %173 to float
+  %multiply.1870.3 = fmul float %174, 0x3FC3BF2820000000
+  %multiply.5371.3 = fmul float %multiply.1032.3, %multiply.1870.3
+  %add.57.i914.3 = fadd float %add.57.i914.2, %multiply.5371.3
+  %multiply.1032.4 = fmul float %divide.631.1, %171
+  %175 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 517
+  %176 = load i8, ptr addrspace(1) %175, align 1, !invariant.load !142
+  %177 = sitofp i8 %176 to float
+  %multiply.1870.4 = fmul float %177, 0x3FC3BF2820000000
+  %multiply.5371.4 = fmul float %multiply.1032.4, %multiply.1870.4
+  %add.57.i914.4 = fadd float %add.57.i914.3, %multiply.5371.4
+  %multiply.1032.5 = fmul float %divide.631.2, %171
+  %178 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 521
+  %179 = load i8, ptr addrspace(1) %178, align 1, !invariant.load !142
+  %180 = sitofp i8 %179 to float
+  %multiply.1870.5 = fmul float %180, 0x3FC3BF2820000000
+  %multiply.5371.5 = fmul float %multiply.1032.5, %multiply.1870.5
+  %add.57.i914.5 = fadd float %add.57.i914.4, %multiply.5371.5
+  %181 = getelementptr inbounds float, ptr addrspace(1) %140, i64 2
+  %182 = load float, ptr addrspace(1) %181, align 4, !invariant.load !142
+  %multiply.1032.6 = fmul float %divide.631, %182
+  %183 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1025
+  %184 = load i8, ptr addrspace(1) %183, align 1, !invariant.load !142
+  %185 = sitofp i8 %184 to float
+  %multiply.1870.6 = fmul float %185, 0x3FC3BF2820000000
+  %multiply.5371.6 = fmul float %multiply.1032.6, %multiply.1870.6
+  %add.57.i914.6 = fadd float %add.57.i914.5, %multiply.5371.6
+  %multiply.1032.7 = fmul float %divide.631.1, %182
+  %186 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1029
+  %187 = load i8, ptr addrspace(1) %186, align 1, !invariant.load !142
+  %188 = sitofp i8 %187 to float
+  %multiply.1870.7 = fmul float %188, 0x3FC3BF2820000000
+  %multiply.5371.7 = fmul float %multiply.1032.7, %multiply.1870.7
+  %add.57.i914.7 = fadd float %add.57.i914.6, %multiply.5371.7
+  %multiply.1032.8 = fmul float %divide.631.2, %182
+  %189 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1033
+  %190 = load i8, ptr addrspace(1) %189, align 1, !invariant.load !142
+  %191 = sitofp i8 %190 to float
+  %multiply.1870.8 = fmul float %191, 0x3FC3BF2820000000
+  %multiply.5371.8 = fmul float %multiply.1032.8, %multiply.1870.8
+  %add.57.i914.8 = fadd float %add.57.i914.7, %multiply.5371.8
+  %192 = fptrunc float %add.57.i914.8 to half
+  %193 = getelementptr inbounds half, ptr addrspace(1) %100, i64 1
+  store half %192, ptr addrspace(1) %193, align 2
+  %194 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2
+  %195 = load i8, ptr addrspace(1) %194, align 2, !invariant.load !142
+  %196 = sitofp i8 %195 to float
+  %multiply.18122 = fmul float %196, 0x3FC3BF2820000000
+  %multiply.53123 = fmul float %multiply.1032, %multiply.18122
+  %add.57.i915 = fadd float %multiply.53123, 0.000000e+00
+  %197 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 6
+  %198 = load i8, ptr addrspace(1) %197, align 2, !invariant.load !142
+  %199 = sitofp i8 %198 to float
+  %multiply.18122.1 = fmul float %199, 0x3FC3BF2820000000
+  %multiply.53123.1 = fmul float %multiply.1032.1, %multiply.18122.1
+  %add.57.i915.1 = fadd float %add.57.i915, %multiply.53123.1
+  %200 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 10
+  %201 = load i8, ptr addrspace(1) %200, align 2, !invariant.load !142
+  %202 = sitofp i8 %201 to float
+  %multiply.18122.2 = fmul float %202, 0x3FC3BF2820000000
+  %multiply.53123.2 = fmul float %multiply.1032.2, %multiply.18122.2
+  %add.57.i915.2 = fadd float %add.57.i915.1, %multiply.53123.2
+  %203 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 514
+  %204 = load i8, ptr addrspace(1) %203, align 2, !invariant.load !142
+  %205 = sitofp i8 %204 to float
+  %multiply.18122.3 = fmul float %205, 0x3FC3BF2820000000
+  %multiply.53123.3 = fmul float %multiply.1032.3, %multiply.18122.3
+  %add.57.i915.3 = fadd float %add.57.i915.2, %multiply.53123.3
+  %206 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 518
+  %207 = load i8, ptr addrspace(1) %206, align 2, !invariant.load !142
+  %208 = sitofp i8 %207 to float
+  %multiply.18122.4 = fmul float %208, 0x3FC3BF2820000000
+  %multiply.53123.4 = fmul float %multiply.1032.4, %multiply.18122.4
+  %add.57.i915.4 = fadd float %add.57.i915.3, %multiply.53123.4
+  %209 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 522
+  %210 = load i8, ptr addrspace(1) %209, align 2, !invariant.load !142
+  %211 = sitofp i8 %210 to float
+  %multiply.18122.5 = fmul float %211, 0x3FC3BF2820000000
+  %multiply.53123.5 = fmul float %multiply.1032.5, %multiply.18122.5
+  %add.57.i915.5 = fadd float %add.57.i915.4, %multiply.53123.5
+  %212 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1026
+  %213 = load i8, ptr addrspace(1) %212, align 2, !invariant.load !142
+  %214 = sitofp i8 %213 to float
+  %multiply.18122.6 = fmul float %214, 0x3FC3BF2820000000
+  %multiply.53123.6 = fmul float %multiply.1032.6, %multiply.18122.6
+  %add.57.i915.6 = fadd float %add.57.i915.5, %multiply.53123.6
+  %215 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1030
+  %216 = load i8, ptr addrspace(1) %215, align 2, !invariant.load !142
+  %217 = sitofp i8 %216 to float
+  %multiply.18122.7 = fmul float %217, 0x3FC3BF2820000000
+  %multiply.53123.7 = fmul float %multiply.1032.7, %multiply.18122.7
+  %add.57.i915.7 = fadd float %add.57.i915.6, %multiply.53123.7
+  %218 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1034
+  %219 = load i8, ptr addrspace(1) %218, align 2, !invariant.load !142
+  %220 = sitofp i8 %219 to float
+  %multiply.18122.8 = fmul float %220, 0x3FC3BF2820000000
+  %multiply.53123.8 = fmul float %multiply.1032.8, %multiply.18122.8
+  %add.57.i915.8 = fadd float %add.57.i915.7, %multiply.53123.8
+  %221 = fptrunc float %add.57.i915.8 to half
+  %222 = getelementptr inbounds half, ptr addrspace(1) %100, i64 2
+  store half %221, ptr addrspace(1) %222, align 4
+  %223 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3
+  %224 = load i8, ptr addrspace(1) %223, align 1, !invariant.load !142
+  %225 = sitofp i8 %224 to float
+  %multiply.18174 = fmul float %225, 0x3FC3BF2820000000
+  %multiply.53175 = fmul float %multiply.1032, %multiply.18174
+  %add.57.i916 = fadd float %multiply.53175, 0.000000e+00
+  %226 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 7
+  %227 = load i8, ptr addrspace(1) %226, align 1, !invariant.load !142
+  %228 = sitofp i8 %227 to float
+  %multiply.18174.1 = fmul float %228, 0x3FC3BF2820000000
+  %multiply.53175.1 = fmul float %multiply.1032.1, %multiply.18174.1
+  %add.57.i916.1 = fadd float %add.57.i916, %multiply.53175.1
+  %229 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 11
+  %230 = load i8, ptr addrspace(1) %229, align 1, !invariant.load !142
+  %231 = sitofp i8 %230 to float
+  %multiply.18174.2 = fmul float %231, 0x3FC3BF2820000000
+  %multiply.53175.2 = fmul float %multiply.1032.2, %multiply.18174.2
+  %add.57.i916.2 = fadd float %add.57.i916.1, %multiply.53175.2
+  %232 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 515
+  %233 = load i8, ptr addrspace(1) %232, align 1, !invariant.load !142
+  %234 = sitofp i8 %233 to float
+  %multiply.18174.3 = fmul float %234, 0x3FC3BF2820000000
+  %multiply.53175.3 = fmul float %multiply.1032.3, %multiply.18174.3
+  %add.57.i916.3 = fadd float %add.57.i916.2, %multiply.53175.3
+  %235 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 519
+  %236 = load i8, ptr addrspace(1) %235, align 1, !invariant.load !142
+  %237 = sitofp i8 %236 to float
+  %multiply.18174.4 = fmul float %237, 0x3FC3BF2820000000
+  %multiply.53175.4 = fmul float %multiply.1032.4, %multiply.18174.4
+  %add.57.i916.4 = fadd float %add.57.i916.3, %multiply.53175.4
+  %238 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 523
+  %239 = load i8, ptr addrspace(1) %238, align 1, !invariant.load !142
+  %240 = sitofp i8 %239 to float
+  %multiply.18174.5 = fmul float %240, 0x3FC3BF2820000000
+  %multiply.53175.5 = fmul float %multiply.1032.5, %multiply.18174.5
+  %add.57.i916.5 = fadd float %add.57.i916.4, %multiply.53175.5
+  %241 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1027
+  %242 = load i8, ptr addrspace(1) %241, align 1, !invariant.load !142
+  %243 = sitofp i8 %242 to float
+  %multiply.18174.6 = fmul float %243, 0x3FC3BF2820000000
+  %multiply.53175.6 = fmul float %multiply.1032.6, %multiply.18174.6
+  %add.57.i916.6 = fadd float %add.57.i916.5, %multiply.53175.6
+  %244 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1031
+  %245 = load i8, ptr addrspace(1) %244, align 1, !invariant.load !142
+  %246 = sitofp i8 %245 to float
+  %multiply.18174.7 = fmul float %246, 0x3FC3BF2820000000
+  %multiply.53175.7 = fmul float %multiply.1032.7, %multiply.18174.7
+  %add.57.i916.7 = fadd float %add.57.i916.6, %multiply.53175.7
+  %247 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1035
+  %248 = load i8, ptr addrspace(1) %247, align 1, !invariant.load !142
+  %249 = sitofp i8 %248 to float
+  %multiply.18174.8 = fmul float %249, 0x3FC3BF2820000000
+  %multiply.53175.8 = fmul float %multiply.1032.8, %multiply.18174.8
+  %add.57.i916.8 = fadd float %add.57.i916.7, %multiply.53175.8
+  %250 = fptrunc float %add.57.i916.8 to half
+  %251 = getelementptr inbounds half, ptr addrspace(1) %100, i64 3
+  store half %250, ptr addrspace(1) %251, align 2
+  %252 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277952
+  %253 = load i8, ptr addrspace(1) %252, align 4, !invariant.load !142
+  %254 = sitofp i8 %253 to float
+  %multiply.18226 = fmul float %254, 0x3FC3BF2820000000
+  %multiply.53227 = fmul float %multiply.1032, %multiply.18226
+  %add.57.i917 = fadd float %multiply.53227, 0.000000e+00
+  %255 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277956
+  %256 = load i8, ptr addrspace(1) %255, align 4, !invariant.load !142
+  %257 = sitofp i8 %256 to float
+  %multiply.18226.1 = fmul float %257, 0x3FC3BF2820000000
+  %multiply.53227.1 = fmul float %multiply.1032.1, %multiply.18226.1
+  %add.57.i917.1 = fadd float %add.57.i917, %multiply.53227.1
+  %258 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277960
+  %259 = load i8, ptr addrspace(1) %258, align 4, !invariant.load !142
+  %260 = sitofp i8 %259 to float
+  %multiply.18226.2 = fmul float %260, 0x3FC3BF2820000000
+  %multiply.53227.2 = fmul float %multiply.1032.2, %multiply.18226.2
+  %add.57.i917.2 = fadd float %add.57.i917.1, %multiply.53227.2
+  %261 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278464
+  %262 = load i8, ptr addrspace(1) %261, align 4, !invariant.load !142
+  %263 = sitofp i8 %262 to float
+  %multiply.18226.3 = fmul float %263, 0x3FC3BF2820000000
+  %multiply.53227.3 = fmul float %multiply.1032.3, %multiply.18226.3
+  %add.57.i917.3 = fadd float %add.57.i917.2, %multiply.53227.3
+  %264 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278468
+  %265 = load i8, ptr addrspace(1) %264, align 4, !invariant.load !142
+  %266 = sitofp i8 %265 to float
+  %multiply.18226.4 = fmul float %266, 0x3FC3BF2820000000
+  %multiply.53227.4 = fmul float %multiply.1032.4, %multiply.18226.4
+  %add.57.i917.4 = fadd float %add.57.i917.3, %multiply.53227.4
+  %267 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278472
+  %268 = load i8, ptr addrspace(1) %267, align 4, !invariant.load !142
+  %269 = sitofp i8 %268 to float
+  %multiply.18226.5 = fmul float %269, 0x3FC3BF2820000000
+  %multiply.53227.5 = fmul float %multiply.1032.5, %multiply.18226.5
+  %add.57.i917.5 = fadd float %add.57.i917.4, %multiply.53227.5
+  %270 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278976
+  %271 = load i8, ptr addrspace(1) %270, align 4, !invariant.load !142
+  %272 = sitofp i8 %271 to float
+  %multiply.18226.6 = fmul float %272, 0x3FC3BF2820000000
+  %multiply.53227.6 = fmul float %multiply.1032.6, %multiply.18226.6
+  %add.57.i917.6 = fadd float %add.57.i917.5, %multiply.53227.6
+  %273 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278980
+  %274 = load i8, ptr addrspace(1) %273, align 4, !invariant.load !142
+  %275 = sitofp i8 %274 to float
+  %multiply.18226.7 = fmul float %275, 0x3FC3BF2820000000
+  %multiply.53227.7 = fmul float %multiply.1032.7, %multiply.18226.7
+  %add.57.i917.7 = fadd float %add.57.i917.6, %multiply.53227.7
+  %276 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278984
+  %277 = load i8, ptr addrspace(1) %276, align 4, !invariant.load !142
+  %278 = sitofp i8 %277 to float
+  %multiply.18226.8 = fmul float %278, 0x3FC3BF2820000000
+  %multiply.53227.8 = fmul float %multiply.1032.8, %multiply.18226.8
+  %add.57.i917.8 = fadd float %add.57.i917.7, %multiply.53227.8
+  %279 = fptrunc float %add.57.i917.8 to half
+  %280 = getelementptr inbounds half, ptr addrspace(1) %100, i64 4
+  store half %279, ptr addrspace(1) %280, align 8
+  %281 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277953
+  %282 = load i8, ptr addrspace(1) %281, align 1, !invariant.load !142
+  %283 = sitofp i8 %282 to float
+  %multiply.18278 = fmul float %283, 0x3FC3BF2820000000
+  %multiply.53279 = fmul float %multiply.1032, %multiply.18278
+  %add.57.i918 = fadd float %multiply.53279, 0.000000e+00
+  %284 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277957
+  %285 = load i8, ptr addrspace(1) %284, align 1, !invariant.load !142
+  %286 = sitofp i8 %285 to float
+  %multiply.18278.1 = fmul float %286, 0x3FC3BF2820000000
+  %multiply.53279.1 = fmul float %multiply.1032.1, %multiply.18278.1
+  %add.57.i918.1 = fadd float %add.57.i918, %multiply.53279.1
+  %287 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277961
+  %288 = load i8, ptr addrspace(1) %287, align 1, !invariant.load !142
+  %289 = sitofp i8 %288 to float
+  %multiply.18278.2 = fmul float %289, 0x3FC3BF2820000000
+  %multiply.53279.2 = fmul float %multiply.1032.2, %multiply.18278.2
+  %add.57.i918.2 = fadd float %add.57.i918.1, %multiply.53279.2
+  %290 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278465
+  %291 = load i8, ptr addrspace(1) %290, align 1, !invariant.load !142
+  %292 = sitofp i8 %291 to float
+  %multiply.18278.3 = fmul float %292, 0x3FC3BF2820000000
+  %multiply.53279.3 = fmul float %multiply.1032.3, %multiply.18278.3
+  %add.57.i918.3 = fadd float %add.57.i918.2, %multiply.53279.3
+  %293 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278469
+  %294 = load i8, ptr addrspace(1) %293, align 1, !invariant.load !142
+  %295 = sitofp i8 %294 to float
+  %multiply.18278.4 = fmul float %295, 0x3FC3BF2820000000
+  %multiply.53279.4 = fmul float %multiply.1032.4, %multiply.18278.4
+  %add.57.i918.4 = fadd float %add.57.i918.3, %multiply.53279.4
+  %296 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278473
+  %297 = load i8, ptr addrspace(1) %296, align 1, !invariant.load !142
+  %298 = sitofp i8 %297 to float
+  %multiply.18278.5 = fmul float %298, 0x3FC3BF2820000000
+  %multiply.53279.5 = fmul float %multiply.1032.5, %multiply.18278.5
+  %add.57.i918.5 = fadd float %add.57.i918.4, %multiply.53279.5
+  %299 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278977
+  %300 = load i8, ptr addrspace(1) %299, align 1, !invariant.load !142
+  %301 = sitofp i8 %300 to float
+  %multiply.18278.6 = fmul float %301, 0x3FC3BF2820000000
+  %multiply.53279.6 = fmul float %multiply.1032.6, %multiply.18278.6
+  %add.57.i918.6 = fadd float %add.57.i918.5, %multiply.53279.6
+  %302 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278981
+  %303 = load i8, ptr addrspace(1) %302, align 1, !invariant.load !142
+  %304 = sitofp i8 %303 to float
+  %multiply.18278.7 = fmul float %304, 0x3FC3BF2820000000
+  %multiply.53279.7 = fmul float %multiply.1032.7, %multiply.18278.7
+  %add.57.i918.7 = fadd float %add.57.i918.6, %multiply.53279.7
+  %305 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278985
+  %306 = load i8, ptr addrspace(1) %305, align 1, !invariant.load !142
+  %307 = sitofp i8 %306 to float
+  %multiply.18278.8 = fmul float %307, 0x3FC3BF2820000000
+  %multiply.53279.8 = fmul float %multiply.1032.8, %multiply.18278.8
+  %add.57.i918.8 = fadd float %add.57.i918.7, %multiply.53279.8
+  %308 = fptrunc float %add.57.i918.8 to half
+  %309 = getelementptr inbounds half, ptr addrspace(1) %100, i64 5
+  store half %308, ptr addrspace(1) %309, align 2
+  %310 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277954
+  %311 = load i8, ptr addrspace(1) %310, align 2, !invariant.load !142
+  %312 = sitofp i8 %311 to float
+  %multiply.18330 = fmul float %312, 0x3FC3BF2820000000
+  %multiply.53331 = fmul float %multiply.1032, %multiply.18330
+  %add.57.i919 = fadd float %multiply.53331, 0.000000e+00
+  %313 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277958
+  %314 = load i8, ptr addrspace(1) %313, align 2, !invariant.load !142
+  %315 = sitofp i8 %314 to float
+  %multiply.18330.1 = fmul float %315, 0x3FC3BF2820000000
+  %multiply.53331.1 = fmul float %multiply.1032.1, %multiply.18330.1
+  %add.57.i919.1 = fadd float %add.57.i919, %multiply.53331.1
+  %316 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277962
+  %317 = load i8, ptr addrspace(1) %316, align 2, !invariant.load !142
+  %318 = sitofp i8 %317 to float
+  %multiply.18330.2 = fmul float %318, 0x3FC3BF2820000000
+  %multiply.53331.2 = fmul float %multiply.1032.2, %multiply.18330.2
+  %add.57.i919.2 = fadd float %add.57.i919.1, %multiply.53331.2
+  %319 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278466
+  %320 = load i8, ptr addrspace(1) %319, align 2, !invariant.load !142
+  %321 = sitofp i8 %320 to float
+  %multiply.18330.3 = fmul float %321, 0x3FC3BF2820000000
+  %multiply.53331.3 = fmul float %multiply.1032.3, %multiply.18330.3
+  %add.57.i919.3 = fadd float %add.57.i919.2, %multiply.53331.3
+  %322 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278470
+  %323 = load i8, ptr addrspace(1) %322, align 2, !invariant.load !142
+  %324 = sitofp i8 %323 to float
+  %multiply.18330.4 = fmul float %324, 0x3FC3BF2820000000
+  %multiply.53331.4 = fmul float %multiply.1032.4, %multiply.18330.4
+  %add.57.i919.4 = fadd float %add.57.i919.3, %multiply.53331.4
+  %325 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278474
+  %326 = load i8, ptr addrspace(1) %325, align 2, !invariant.load !142
+  %327 = sitofp i8 %326 to float
+  %multiply.18330.5 = fmul float %327, 0x3FC3BF2820000000
+  %multiply.53331.5 = fmul float %multiply.1032.5, %multiply.18330.5
+  %add.57.i919.5 = fadd float %add.57.i919.4, %multiply.53331.5
+  %328 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278978
+  %329 = load i8, ptr addrspace(1) %328, align 2, !invariant.load !142
+  %330 = sitofp i8 %329 to float
+  %multiply.18330.6 = fmul float %330, 0x3FC3BF2820000000
+  %multiply.53331.6 = fmul float %multiply.1032.6, %multiply.18330.6
+  %add.57.i919.6 = fadd float %add.57.i919.5, %multiply.53331.6
+  %331 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278982
+  %332 = load i8, ptr addrspace(1) %331, align 2, !invariant.load !142
+  %333 = sitofp i8 %332 to float
+  %multiply.18330.7 = fmul float %333, 0x3FC3BF2820000000
+  %multiply.53331.7 = fmul float %multiply.1032.7, %multiply.18330.7
+  %add.57.i919.7 = fadd float %add.57.i919.6, %multiply.53331.7
+  %334 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278986
+  %335 = load i8, ptr addrspace(1) %334, align 2, !invariant.load !142
+  %336 = sitofp i8 %335 to float
+  %multiply.18330.8 = fmul float %336, 0x3FC3BF2820000000
+  %multiply.53331.8 = fmul float %multiply.1032.8, %multiply.18330.8
+  %add.57.i919.8 = fadd float %add.57.i919.7, %multiply.53331.8
+  %337 = fptrunc float %add.57.i919.8 to half
+  %338 = getelementptr inbounds half, ptr addrspace(1) %100, i64 6
+  store half %337, ptr addrspace(1) %338, align 4
+  %339 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277955
+  %340 = load i8, ptr addrspace(1) %339, align 1, !invariant.load !142
+  %341 = sitofp i8 %340 to float
+  %multiply.18382 = fmul float %341, 0x3FC3BF2820000000
+  %multiply.53383 = fmul float %multiply.1032, %multiply.18382
+  %add.57.i920 = fadd float %multiply.53383, 0.000000e+00
+  %342 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277959
+  %343 = load i8, ptr addrspace(1) %342, align 1, !invariant.load !142
+  %344 = sitofp i8 %343 to float
+  %multiply.18382.1 = fmul float %344, 0x3FC3BF2820000000
+  %multiply.53383.1 = fmul float %multiply.1032.1, %multiply.18382.1
+  %add.57.i920.1 = fadd float %add.57.i920, %multiply.53383.1
+  %345 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1277963
+  %346 = load i8, ptr addrspace(1) %345, align 1, !invariant.load !142
+  %347 = sitofp i8 %346 to float
+  %multiply.18382.2 = fmul float %347, 0x3FC3BF2820000000
+  %multiply.53383.2 = fmul float %multiply.1032.2, %multiply.18382.2
+  %add.57.i920.2 = fadd float %add.57.i920.1, %multiply.53383.2
+  %348 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278467
+  %349 = load i8, ptr addrspace(1) %348, align 1, !invariant.load !142
+  %350 = sitofp i8 %349 to float
+  %multiply.18382.3 = fmul float %350, 0x3FC3BF2820000000
+  %multiply.53383.3 = fmul float %multiply.1032.3, %multiply.18382.3
+  %add.57.i920.3 = fadd float %add.57.i920.2, %multiply.53383.3
+  %351 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278471
+  %352 = load i8, ptr addrspace(1) %351, align 1, !invariant.load !142
+  %353 = sitofp i8 %352 to float
+  %multiply.18382.4 = fmul float %353, 0x3FC3BF2820000000
+  %multiply.53383.4 = fmul float %multiply.1032.4, %multiply.18382.4
+  %add.57.i920.4 = fadd float %add.57.i920.3, %multiply.53383.4
+  %354 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278475
+  %355 = load i8, ptr addrspace(1) %354, align 1, !invariant.load !142
+  %356 = sitofp i8 %355 to float
+  %multiply.18382.5 = fmul float %356, 0x3FC3BF2820000000
+  %multiply.53383.5 = fmul float %multiply.1032.5, %multiply.18382.5
+  %add.57.i920.5 = fadd float %add.57.i920.4, %multiply.53383.5
+  %357 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278979
+  %358 = load i8, ptr addrspace(1) %357, align 1, !invariant.load !142
+  %359 = sitofp i8 %358 to float
+  %multiply.18382.6 = fmul float %359, 0x3FC3BF2820000000
+  %multiply.53383.6 = fmul float %multiply.1032.6, %multiply.18382.6
+  %add.57.i920.6 = fadd float %add.57.i920.5, %multiply.53383.6
+  %360 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278983
+  %361 = load i8, ptr addrspace(1) %360, align 1, !invariant.load !142
+  %362 = sitofp i8 %361 to float
+  %multiply.18382.7 = fmul float %362, 0x3FC3BF2820000000
+  %multiply.53383.7 = fmul float %multiply.1032.7, %multiply.18382.7
+  %add.57.i920.7 = fadd float %add.57.i920.6, %multiply.53383.7
+  %363 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 1278987
+  %364 = load i8, ptr addrspace(1) %363, align 1, !invariant.load !142
+  %365 = sitofp i8 %364 to float
+  %multiply.18382.8 = fmul float %365, 0x3FC3BF2820000000
+  %multiply.53383.8 = fmul float %multiply.1032.8, %multiply.18382.8
+  %add.57.i920.8 = fadd float %add.57.i920.7, %multiply.53383.8
+  %366 = fptrunc float %add.57.i920.8 to half
+  %367 = getelementptr inbounds half, ptr addrspace(1) %100, i64 7
+  store half %366, ptr addrspace(1) %367, align 2
+  %368 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555904
+  %369 = load i8, ptr addrspace(1) %368, align 4, !invariant.load !142
+  %370 = sitofp i8 %369 to float
+  %multiply.18434 = fmul float %370, 0x3FC3BF2820000000
+  %multiply.53435 = fmul float %multiply.1032, %multiply.18434
+  %add.57.i921 = fadd float %multiply.53435, 0.000000e+00
+  %371 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555908
+  %372 = load i8, ptr addrspace(1) %371, align 4, !invariant.load !142
+  %373 = sitofp i8 %372 to float
+  %multiply.18434.1 = fmul float %373, 0x3FC3BF2820000000
+  %multiply.53435.1 = fmul float %multiply.1032.1, %multiply.18434.1
+  %add.57.i921.1 = fadd float %add.57.i921, %multiply.53435.1
+  %374 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555912
+  %375 = load i8, ptr addrspace(1) %374, align 4, !invariant.load !142
+  %376 = sitofp i8 %375 to float
+  %multiply.18434.2 = fmul float %376, 0x3FC3BF2820000000
+  %multiply.53435.2 = fmul float %multiply.1032.2, %multiply.18434.2
+  %add.57.i921.2 = fadd float %add.57.i921.1, %multiply.53435.2
+  %377 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556416
+  %378 = load i8, ptr addrspace(1) %377, align 4, !invariant.load !142
+  %379 = sitofp i8 %378 to float
+  %multiply.18434.3 = fmul float %379, 0x3FC3BF2820000000
+  %multiply.53435.3 = fmul float %multiply.1032.3, %multiply.18434.3
+  %add.57.i921.3 = fadd float %add.57.i921.2, %multiply.53435.3
+  %380 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556420
+  %381 = load i8, ptr addrspace(1) %380, align 4, !invariant.load !142
+  %382 = sitofp i8 %381 to float
+  %multiply.18434.4 = fmul float %382, 0x3FC3BF2820000000
+  %multiply.53435.4 = fmul float %multiply.1032.4, %multiply.18434.4
+  %add.57.i921.4 = fadd float %add.57.i921.3, %multiply.53435.4
+  %383 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556424
+  %384 = load i8, ptr addrspace(1) %383, align 4, !invariant.load !142
+  %385 = sitofp i8 %384 to float
+  %multiply.18434.5 = fmul float %385, 0x3FC3BF2820000000
+  %multiply.53435.5 = fmul float %multiply.1032.5, %multiply.18434.5
+  %add.57.i921.5 = fadd float %add.57.i921.4, %multiply.53435.5
+  %386 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556928
+  %387 = load i8, ptr addrspace(1) %386, align 4, !invariant.load !142
+  %388 = sitofp i8 %387 to float
+  %multiply.18434.6 = fmul float %388, 0x3FC3BF2820000000
+  %multiply.53435.6 = fmul float %multiply.1032.6, %multiply.18434.6
+  %add.57.i921.6 = fadd float %add.57.i921.5, %multiply.53435.6
+  %389 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556932
+  %390 = load i8, ptr addrspace(1) %389, align 4, !invariant.load !142
+  %391 = sitofp i8 %390 to float
+  %multiply.18434.7 = fmul float %391, 0x3FC3BF2820000000
+  %multiply.53435.7 = fmul float %multiply.1032.7, %multiply.18434.7
+  %add.57.i921.7 = fadd float %add.57.i921.6, %multiply.53435.7
+  %392 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556936
+  %393 = load i8, ptr addrspace(1) %392, align 4, !invariant.load !142
+  %394 = sitofp i8 %393 to float
+  %multiply.18434.8 = fmul float %394, 0x3FC3BF2820000000
+  %multiply.53435.8 = fmul float %multiply.1032.8, %multiply.18434.8
+  %add.57.i921.8 = fadd float %add.57.i921.7, %multiply.53435.8
+  %395 = fptrunc float %add.57.i921.8 to half
+  %396 = getelementptr inbounds half, ptr addrspace(1) %100, i64 8
+  store half %395, ptr addrspace(1) %396, align 16
+  %397 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555905
+  %398 = load i8, ptr addrspace(1) %397, align 1, !invariant.load !142
+  %399 = sitofp i8 %398 to float
+  %multiply.18486 = fmul float %399, 0x3FC3BF2820000000
+  %multiply.53487 = fmul float %multiply.1032, %multiply.18486
+  %add.57.i922 = fadd float %multiply.53487, 0.000000e+00
+  %400 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555909
+  %401 = load i8, ptr addrspace(1) %400, align 1, !invariant.load !142
+  %402 = sitofp i8 %401 to float
+  %multiply.18486.1 = fmul float %402, 0x3FC3BF2820000000
+  %multiply.53487.1 = fmul float %multiply.1032.1, %multiply.18486.1
+  %add.57.i922.1 = fadd float %add.57.i922, %multiply.53487.1
+  %403 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555913
+  %404 = load i8, ptr addrspace(1) %403, align 1, !invariant.load !142
+  %405 = sitofp i8 %404 to float
+  %multiply.18486.2 = fmul float %405, 0x3FC3BF2820000000
+  %multiply.53487.2 = fmul float %multiply.1032.2, %multiply.18486.2
+  %add.57.i922.2 = fadd float %add.57.i922.1, %multiply.53487.2
+  %406 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556417
+  %407 = load i8, ptr addrspace(1) %406, align 1, !invariant.load !142
+  %408 = sitofp i8 %407 to float
+  %multiply.18486.3 = fmul float %408, 0x3FC3BF2820000000
+  %multiply.53487.3 = fmul float %multiply.1032.3, %multiply.18486.3
+  %add.57.i922.3 = fadd float %add.57.i922.2, %multiply.53487.3
+  %409 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556421
+  %410 = load i8, ptr addrspace(1) %409, align 1, !invariant.load !142
+  %411 = sitofp i8 %410 to float
+  %multiply.18486.4 = fmul float %411, 0x3FC3BF2820000000
+  %multiply.53487.4 = fmul float %multiply.1032.4, %multiply.18486.4
+  %add.57.i922.4 = fadd float %add.57.i922.3, %multiply.53487.4
+  %412 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556425
+  %413 = load i8, ptr addrspace(1) %412, align 1, !invariant.load !142
+  %414 = sitofp i8 %413 to float
+  %multiply.18486.5 = fmul float %414, 0x3FC3BF2820000000
+  %multiply.53487.5 = fmul float %multiply.1032.5, %multiply.18486.5
+  %add.57.i922.5 = fadd float %add.57.i922.4, %multiply.53487.5
+  %415 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556929
+  %416 = load i8, ptr addrspace(1) %415, align 1, !invariant.load !142
+  %417 = sitofp i8 %416 to float
+  %multiply.18486.6 = fmul float %417, 0x3FC3BF2820000000
+  %multiply.53487.6 = fmul float %multiply.1032.6, %multiply.18486.6
+  %add.57.i922.6 = fadd float %add.57.i922.5, %multiply.53487.6
+  %418 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556933
+  %419 = load i8, ptr addrspace(1) %418, align 1, !invariant.load !142
+  %420 = sitofp i8 %419 to float
+  %multiply.18486.7 = fmul float %420, 0x3FC3BF2820000000
+  %multiply.53487.7 = fmul float %multiply.1032.7, %multiply.18486.7
+  %add.57.i922.7 = fadd float %add.57.i922.6, %multiply.53487.7
+  %421 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556937
+  %422 = load i8, ptr addrspace(1) %421, align 1, !invariant.load !142
+  %423 = sitofp i8 %422 to float
+  %multiply.18486.8 = fmul float %423, 0x3FC3BF2820000000
+  %multiply.53487.8 = fmul float %multiply.1032.8, %multiply.18486.8
+  %add.57.i922.8 = fadd float %add.57.i922.7, %multiply.53487.8
+  %424 = fptrunc float %add.57.i922.8 to half
+  %425 = getelementptr inbounds half, ptr addrspace(1) %100, i64 9
+  store half %424, ptr addrspace(1) %425, align 2
+  %426 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555906
+  %427 = load i8, ptr addrspace(1) %426, align 2, !invariant.load !142
+  %428 = sitofp i8 %427 to float
+  %multiply.18538 = fmul float %428, 0x3FC3BF2820000000
+  %multiply.53539 = fmul float %multiply.1032, %multiply.18538
+  %add.57.i923 = fadd float %multiply.53539, 0.000000e+00
+  %429 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555910
+  %430 = load i8, ptr addrspace(1) %429, align 2, !invariant.load !142
+  %431 = sitofp i8 %430 to float
+  %multiply.18538.1 = fmul float %431, 0x3FC3BF2820000000
+  %multiply.53539.1 = fmul float %multiply.1032.1, %multiply.18538.1
+  %add.57.i923.1 = fadd float %add.57.i923, %multiply.53539.1
+  %432 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555914
+  %433 = load i8, ptr addrspace(1) %432, align 2, !invariant.load !142
+  %434 = sitofp i8 %433 to float
+  %multiply.18538.2 = fmul float %434, 0x3FC3BF2820000000
+  %multiply.53539.2 = fmul float %multiply.1032.2, %multiply.18538.2
+  %add.57.i923.2 = fadd float %add.57.i923.1, %multiply.53539.2
+  %435 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556418
+  %436 = load i8, ptr addrspace(1) %435, align 2, !invariant.load !142
+  %437 = sitofp i8 %436 to float
+  %multiply.18538.3 = fmul float %437, 0x3FC3BF2820000000
+  %multiply.53539.3 = fmul float %multiply.1032.3, %multiply.18538.3
+  %add.57.i923.3 = fadd float %add.57.i923.2, %multiply.53539.3
+  %438 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556422
+  %439 = load i8, ptr addrspace(1) %438, align 2, !invariant.load !142
+  %440 = sitofp i8 %439 to float
+  %multiply.18538.4 = fmul float %440, 0x3FC3BF2820000000
+  %multiply.53539.4 = fmul float %multiply.1032.4, %multiply.18538.4
+  %add.57.i923.4 = fadd float %add.57.i923.3, %multiply.53539.4
+  %441 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556426
+  %442 = load i8, ptr addrspace(1) %441, align 2, !invariant.load !142
+  %443 = sitofp i8 %442 to float
+  %multiply.18538.5 = fmul float %443, 0x3FC3BF2820000000
+  %multiply.53539.5 = fmul float %multiply.1032.5, %multiply.18538.5
+  %add.57.i923.5 = fadd float %add.57.i923.4, %multiply.53539.5
+  %444 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556930
+  %445 = load i8, ptr addrspace(1) %444, align 2, !invariant.load !142
+  %446 = sitofp i8 %445 to float
+  %multiply.18538.6 = fmul float %446, 0x3FC3BF2820000000
+  %multiply.53539.6 = fmul float %multiply.1032.6, %multiply.18538.6
+  %add.57.i923.6 = fadd float %add.57.i923.5, %multiply.53539.6
+  %447 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556934
+  %448 = load i8, ptr addrspace(1) %447, align 2, !invariant.load !142
+  %449 = sitofp i8 %448 to float
+  %multiply.18538.7 = fmul float %449, 0x3FC3BF2820000000
+  %multiply.53539.7 = fmul float %multiply.1032.7, %multiply.18538.7
+  %add.57.i923.7 = fadd float %add.57.i923.6, %multiply.53539.7
+  %450 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556938
+  %451 = load i8, ptr addrspace(1) %450, align 2, !invariant.load !142
+  %452 = sitofp i8 %451 to float
+  %multiply.18538.8 = fmul float %452, 0x3FC3BF2820000000
+  %multiply.53539.8 = fmul float %multiply.1032.8, %multiply.18538.8
+  %add.57.i923.8 = fadd float %add.57.i923.7, %multiply.53539.8
+  %453 = fptrunc float %add.57.i923.8 to half
+  %454 = getelementptr inbounds half, ptr addrspace(1) %100, i64 10
+  store half %453, ptr addrspace(1) %454, align 4
+  %455 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555907
+  %456 = load i8, ptr addrspace(1) %455, align 1, !invariant.load !142
+  %457 = sitofp i8 %456 to float
+  %multiply.18590 = fmul float %457, 0x3FC3BF2820000000
+  %multiply.53591 = fmul float %multiply.1032, %multiply.18590
+  %add.57.i924 = fadd float %multiply.53591, 0.000000e+00
+  %458 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555911
+  %459 = load i8, ptr addrspace(1) %458, align 1, !invariant.load !142
+  %460 = sitofp i8 %459 to float
+  %multiply.18590.1 = fmul float %460, 0x3FC3BF2820000000
+  %multiply.53591.1 = fmul float %multiply.1032.1, %multiply.18590.1
+  %add.57.i924.1 = fadd float %add.57.i924, %multiply.53591.1
+  %461 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2555915
+  %462 = load i8, ptr addrspace(1) %461, align 1, !invariant.load !142
+  %463 = sitofp i8 %462 to float
+  %multiply.18590.2 = fmul float %463, 0x3FC3BF2820000000
+  %multiply.53591.2 = fmul float %multiply.1032.2, %multiply.18590.2
+  %add.57.i924.2 = fadd float %add.57.i924.1, %multiply.53591.2
+  %464 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556419
+  %465 = load i8, ptr addrspace(1) %464, align 1, !invariant.load !142
+  %466 = sitofp i8 %465 to float
+  %multiply.18590.3 = fmul float %466, 0x3FC3BF2820000000
+  %multiply.53591.3 = fmul float %multiply.1032.3, %multiply.18590.3
+  %add.57.i924.3 = fadd float %add.57.i924.2, %multiply.53591.3
+  %467 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556423
+  %468 = load i8, ptr addrspace(1) %467, align 1, !invariant.load !142
+  %469 = sitofp i8 %468 to float
+  %multiply.18590.4 = fmul float %469, 0x3FC3BF2820000000
+  %multiply.53591.4 = fmul float %multiply.1032.4, %multiply.18590.4
+  %add.57.i924.4 = fadd float %add.57.i924.3, %multiply.53591.4
+  %470 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556427
+  %471 = load i8, ptr addrspace(1) %470, align 1, !invariant.load !142
+  %472 = sitofp i8 %471 to float
+  %multiply.18590.5 = fmul float %472, 0x3FC3BF2820000000
+  %multiply.53591.5 = fmul float %multiply.1032.5, %multiply.18590.5
+  %add.57.i924.5 = fadd float %add.57.i924.4, %multiply.53591.5
+  %473 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556931
+  %474 = load i8, ptr addrspace(1) %473, align 1, !invariant.load !142
+  %475 = sitofp i8 %474 to float
+  %multiply.18590.6 = fmul float %475, 0x3FC3BF2820000000
+  %multiply.53591.6 = fmul float %multiply.1032.6, %multiply.18590.6
+  %add.57.i924.6 = fadd float %add.57.i924.5, %multiply.53591.6
+  %476 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556935
+  %477 = load i8, ptr addrspace(1) %476, align 1, !invariant.load !142
+  %478 = sitofp i8 %477 to float
+  %multiply.18590.7 = fmul float %478, 0x3FC3BF2820000000
+  %multiply.53591.7 = fmul float %multiply.1032.7, %multiply.18590.7
+  %add.57.i924.7 = fadd float %add.57.i924.6, %multiply.53591.7
+  %479 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 2556939
+  %480 = load i8, ptr addrspace(1) %479, align 1, !invariant.load !142
+  %481 = sitofp i8 %480 to float
+  %multiply.18590.8 = fmul float %481, 0x3FC3BF2820000000
+  %multiply.53591.8 = fmul float %multiply.1032.8, %multiply.18590.8
+  %add.57.i924.8 = fadd float %add.57.i924.7, %multiply.53591.8
+  %482 = fptrunc float %add.57.i924.8 to half
+  %483 = getelementptr inbounds half, ptr addrspace(1) %100, i64 11
+  store half %482, ptr addrspace(1) %483, align 2
+  %484 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833856
+  %485 = load i8, ptr addrspace(1) %484, align 4, !invariant.load !142
+  %486 = sitofp i8 %485 to float
+  %multiply.18642 = fmul float %486, 0x3FC3BF2820000000
+  %multiply.53643 = fmul float %multiply.1032, %multiply.18642
+  %add.57.i925 = fadd float %multiply.53643, 0.000000e+00
+  %487 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833860
+  %488 = load i8, ptr addrspace(1) %487, align 4, !invariant.load !142
+  %489 = sitofp i8 %488 to float
+  %multiply.18642.1 = fmul float %489, 0x3FC3BF2820000000
+  %multiply.53643.1 = fmul float %multiply.1032.1, %multiply.18642.1
+  %add.57.i925.1 = fadd float %add.57.i925, %multiply.53643.1
+  %490 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833864
+  %491 = load i8, ptr addrspace(1) %490, align 4, !invariant.load !142
+  %492 = sitofp i8 %491 to float
+  %multiply.18642.2 = fmul float %492, 0x3FC3BF2820000000
+  %multiply.53643.2 = fmul float %multiply.1032.2, %multiply.18642.2
+  %add.57.i925.2 = fadd float %add.57.i925.1, %multiply.53643.2
+  %493 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834368
+  %494 = load i8, ptr addrspace(1) %493, align 4, !invariant.load !142
+  %495 = sitofp i8 %494 to float
+  %multiply.18642.3 = fmul float %495, 0x3FC3BF2820000000
+  %multiply.53643.3 = fmul float %multiply.1032.3, %multiply.18642.3
+  %add.57.i925.3 = fadd float %add.57.i925.2, %multiply.53643.3
+  %496 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834372
+  %497 = load i8, ptr addrspace(1) %496, align 4, !invariant.load !142
+  %498 = sitofp i8 %497 to float
+  %multiply.18642.4 = fmul float %498, 0x3FC3BF2820000000
+  %multiply.53643.4 = fmul float %multiply.1032.4, %multiply.18642.4
+  %add.57.i925.4 = fadd float %add.57.i925.3, %multiply.53643.4
+  %499 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834376
+  %500 = load i8, ptr addrspace(1) %499, align 4, !invariant.load !142
+  %501 = sitofp i8 %500 to float
+  %multiply.18642.5 = fmul float %501, 0x3FC3BF2820000000
+  %multiply.53643.5 = fmul float %multiply.1032.5, %multiply.18642.5
+  %add.57.i925.5 = fadd float %add.57.i925.4, %multiply.53643.5
+  %502 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834880
+  %503 = load i8, ptr addrspace(1) %502, align 4, !invariant.load !142
+  %504 = sitofp i8 %503 to float
+  %multiply.18642.6 = fmul float %504, 0x3FC3BF2820000000
+  %multiply.53643.6 = fmul float %multiply.1032.6, %multiply.18642.6
+  %add.57.i925.6 = fadd float %add.57.i925.5, %multiply.53643.6
+  %505 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834884
+  %506 = load i8, ptr addrspace(1) %505, align 4, !invariant.load !142
+  %507 = sitofp i8 %506 to float
+  %multiply.18642.7 = fmul float %507, 0x3FC3BF2820000000
+  %multiply.53643.7 = fmul float %multiply.1032.7, %multiply.18642.7
+  %add.57.i925.7 = fadd float %add.57.i925.6, %multiply.53643.7
+  %508 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834888
+  %509 = load i8, ptr addrspace(1) %508, align 4, !invariant.load !142
+  %510 = sitofp i8 %509 to float
+  %multiply.18642.8 = fmul float %510, 0x3FC3BF2820000000
+  %multiply.53643.8 = fmul float %multiply.1032.8, %multiply.18642.8
+  %add.57.i925.8 = fadd float %add.57.i925.7, %multiply.53643.8
+  %511 = fptrunc float %add.57.i925.8 to half
+  %512 = getelementptr inbounds half, ptr addrspace(1) %100, i64 12
+  store half %511, ptr addrspace(1) %512, align 8
+  %513 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833857
+  %514 = load i8, ptr addrspace(1) %513, align 1, !invariant.load !142
+  %515 = sitofp i8 %514 to float
+  %multiply.18694 = fmul float %515, 0x3FC3BF2820000000
+  %multiply.53695 = fmul float %multiply.1032, %multiply.18694
+  %add.57.i926 = fadd float %multiply.53695, 0.000000e+00
+  %516 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833861
+  %517 = load i8, ptr addrspace(1) %516, align 1, !invariant.load !142
+  %518 = sitofp i8 %517 to float
+  %multiply.18694.1 = fmul float %518, 0x3FC3BF2820000000
+  %multiply.53695.1 = fmul float %multiply.1032.1, %multiply.18694.1
+  %add.57.i926.1 = fadd float %add.57.i926, %multiply.53695.1
+  %519 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833865
+  %520 = load i8, ptr addrspace(1) %519, align 1, !invariant.load !142
+  %521 = sitofp i8 %520 to float
+  %multiply.18694.2 = fmul float %521, 0x3FC3BF2820000000
+  %multiply.53695.2 = fmul float %multiply.1032.2, %multiply.18694.2
+  %add.57.i926.2 = fadd float %add.57.i926.1, %multiply.53695.2
+  %522 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834369
+  %523 = load i8, ptr addrspace(1) %522, align 1, !invariant.load !142
+  %524 = sitofp i8 %523 to float
+  %multiply.18694.3 = fmul float %524, 0x3FC3BF2820000000
+  %multiply.53695.3 = fmul float %multiply.1032.3, %multiply.18694.3
+  %add.57.i926.3 = fadd float %add.57.i926.2, %multiply.53695.3
+  %525 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834373
+  %526 = load i8, ptr addrspace(1) %525, align 1, !invariant.load !142
+  %527 = sitofp i8 %526 to float
+  %multiply.18694.4 = fmul float %527, 0x3FC3BF2820000000
+  %multiply.53695.4 = fmul float %multiply.1032.4, %multiply.18694.4
+  %add.57.i926.4 = fadd float %add.57.i926.3, %multiply.53695.4
+  %528 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834377
+  %529 = load i8, ptr addrspace(1) %528, align 1, !invariant.load !142
+  %530 = sitofp i8 %529 to float
+  %multiply.18694.5 = fmul float %530, 0x3FC3BF2820000000
+  %multiply.53695.5 = fmul float %multiply.1032.5, %multiply.18694.5
+  %add.57.i926.5 = fadd float %add.57.i926.4, %multiply.53695.5
+  %531 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834881
+  %532 = load i8, ptr addrspace(1) %531, align 1, !invariant.load !142
+  %533 = sitofp i8 %532 to float
+  %multiply.18694.6 = fmul float %533, 0x3FC3BF2820000000
+  %multiply.53695.6 = fmul float %multiply.1032.6, %multiply.18694.6
+  %add.57.i926.6 = fadd float %add.57.i926.5, %multiply.53695.6
+  %534 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834885
+  %535 = load i8, ptr addrspace(1) %534, align 1, !invariant.load !142
+  %536 = sitofp i8 %535 to float
+  %multiply.18694.7 = fmul float %536, 0x3FC3BF2820000000
+  %multiply.53695.7 = fmul float %multiply.1032.7, %multiply.18694.7
+  %add.57.i926.7 = fadd float %add.57.i926.6, %multiply.53695.7
+  %537 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834889
+  %538 = load i8, ptr addrspace(1) %537, align 1, !invariant.load !142
+  %539 = sitofp i8 %538 to float
+  %multiply.18694.8 = fmul float %539, 0x3FC3BF2820000000
+  %multiply.53695.8 = fmul float %multiply.1032.8, %multiply.18694.8
+  %add.57.i926.8 = fadd float %add.57.i926.7, %multiply.53695.8
+  %540 = fptrunc float %add.57.i926.8 to half
+  %541 = getelementptr inbounds half, ptr addrspace(1) %100, i64 13
+  store half %540, ptr addrspace(1) %541, align 2
+  %542 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833858
+  %543 = load i8, ptr addrspace(1) %542, align 2, !invariant.load !142
+  %544 = sitofp i8 %543 to float
+  %multiply.18746 = fmul float %544, 0x3FC3BF2820000000
+  %multiply.53747 = fmul float %multiply.1032, %multiply.18746
+  %add.57.i927 = fadd float %multiply.53747, 0.000000e+00
+  %545 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833862
+  %546 = load i8, ptr addrspace(1) %545, align 2, !invariant.load !142
+  %547 = sitofp i8 %546 to float
+  %multiply.18746.1 = fmul float %547, 0x3FC3BF2820000000
+  %multiply.53747.1 = fmul float %multiply.1032.1, %multiply.18746.1
+  %add.57.i927.1 = fadd float %add.57.i927, %multiply.53747.1
+  %548 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833866
+  %549 = load i8, ptr addrspace(1) %548, align 2, !invariant.load !142
+  %550 = sitofp i8 %549 to float
+  %multiply.18746.2 = fmul float %550, 0x3FC3BF2820000000
+  %multiply.53747.2 = fmul float %multiply.1032.2, %multiply.18746.2
+  %add.57.i927.2 = fadd float %add.57.i927.1, %multiply.53747.2
+  %551 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834370
+  %552 = load i8, ptr addrspace(1) %551, align 2, !invariant.load !142
+  %553 = sitofp i8 %552 to float
+  %multiply.18746.3 = fmul float %553, 0x3FC3BF2820000000
+  %multiply.53747.3 = fmul float %multiply.1032.3, %multiply.18746.3
+  %add.57.i927.3 = fadd float %add.57.i927.2, %multiply.53747.3
+  %554 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834374
+  %555 = load i8, ptr addrspace(1) %554, align 2, !invariant.load !142
+  %556 = sitofp i8 %555 to float
+  %multiply.18746.4 = fmul float %556, 0x3FC3BF2820000000
+  %multiply.53747.4 = fmul float %multiply.1032.4, %multiply.18746.4
+  %add.57.i927.4 = fadd float %add.57.i927.3, %multiply.53747.4
+  %557 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834378
+  %558 = load i8, ptr addrspace(1) %557, align 2, !invariant.load !142
+  %559 = sitofp i8 %558 to float
+  %multiply.18746.5 = fmul float %559, 0x3FC3BF2820000000
+  %multiply.53747.5 = fmul float %multiply.1032.5, %multiply.18746.5
+  %add.57.i927.5 = fadd float %add.57.i927.4, %multiply.53747.5
+  %560 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834882
+  %561 = load i8, ptr addrspace(1) %560, align 2, !invariant.load !142
+  %562 = sitofp i8 %561 to float
+  %multiply.18746.6 = fmul float %562, 0x3FC3BF2820000000
+  %multiply.53747.6 = fmul float %multiply.1032.6, %multiply.18746.6
+  %add.57.i927.6 = fadd float %add.57.i927.5, %multiply.53747.6
+  %563 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834886
+  %564 = load i8, ptr addrspace(1) %563, align 2, !invariant.load !142
+  %565 = sitofp i8 %564 to float
+  %multiply.18746.7 = fmul float %565, 0x3FC3BF2820000000
+  %multiply.53747.7 = fmul float %multiply.1032.7, %multiply.18746.7
+  %add.57.i927.7 = fadd float %add.57.i927.6, %multiply.53747.7
+  %566 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834890
+  %567 = load i8, ptr addrspace(1) %566, align 2, !invariant.load !142
+  %568 = sitofp i8 %567 to float
+  %multiply.18746.8 = fmul float %568, 0x3FC3BF2820000000
+  %multiply.53747.8 = fmul float %multiply.1032.8, %multiply.18746.8
+  %add.57.i927.8 = fadd float %add.57.i927.7, %multiply.53747.8
+  %569 = fptrunc float %add.57.i927.8 to half
+  %570 = getelementptr inbounds half, ptr addrspace(1) %100, i64 14
+  store half %569, ptr addrspace(1) %570, align 4
+  %571 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833859
+  %572 = load i8, ptr addrspace(1) %571, align 1, !invariant.load !142
+  %573 = sitofp i8 %572 to float
+  %multiply.18798 = fmul float %573, 0x3FC3BF2820000000
+  %multiply.53799 = fmul float %multiply.1032, %multiply.18798
+  %add.57.i928 = fadd float %multiply.53799, 0.000000e+00
+  %574 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833863
+  %575 = load i8, ptr addrspace(1) %574, align 1, !invariant.load !142
+  %576 = sitofp i8 %575 to float
+  %multiply.18798.1 = fmul float %576, 0x3FC3BF2820000000
+  %multiply.53799.1 = fmul float %multiply.1032.1, %multiply.18798.1
+  %add.57.i928.1 = fadd float %add.57.i928, %multiply.53799.1
+  %577 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3833867
+  %578 = load i8, ptr addrspace(1) %577, align 1, !invariant.load !142
+  %579 = sitofp i8 %578 to float
+  %multiply.18798.2 = fmul float %579, 0x3FC3BF2820000000
+  %multiply.53799.2 = fmul float %multiply.1032.2, %multiply.18798.2
+  %add.57.i928.2 = fadd float %add.57.i928.1, %multiply.53799.2
+  %580 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834371
+  %581 = load i8, ptr addrspace(1) %580, align 1, !invariant.load !142
+  %582 = sitofp i8 %581 to float
+  %multiply.18798.3 = fmul float %582, 0x3FC3BF2820000000
+  %multiply.53799.3 = fmul float %multiply.1032.3, %multiply.18798.3
+  %add.57.i928.3 = fadd float %add.57.i928.2, %multiply.53799.3
+  %583 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834375
+  %584 = load i8, ptr addrspace(1) %583, align 1, !invariant.load !142
+  %585 = sitofp i8 %584 to float
+  %multiply.18798.4 = fmul float %585, 0x3FC3BF2820000000
+  %multiply.53799.4 = fmul float %multiply.1032.4, %multiply.18798.4
+  %add.57.i928.4 = fadd float %add.57.i928.3, %multiply.53799.4
+  %586 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834379
+  %587 = load i8, ptr addrspace(1) %586, align 1, !invariant.load !142
+  %588 = sitofp i8 %587 to float
+  %multiply.18798.5 = fmul float %588, 0x3FC3BF2820000000
+  %multiply.53799.5 = fmul float %multiply.1032.5, %multiply.18798.5
+  %add.57.i928.5 = fadd float %add.57.i928.4, %multiply.53799.5
+  %589 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834883
+  %590 = load i8, ptr addrspace(1) %589, align 1, !invariant.load !142
+  %591 = sitofp i8 %590 to float
+  %multiply.18798.6 = fmul float %591, 0x3FC3BF2820000000
+  %multiply.53799.6 = fmul float %multiply.1032.6, %multiply.18798.6
+  %add.57.i928.6 = fadd float %add.57.i928.5, %multiply.53799.6
+  %592 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834887
+  %593 = load i8, ptr addrspace(1) %592, align 1, !invariant.load !142
+  %594 = sitofp i8 %593 to float
+  %multiply.18798.7 = fmul float %594, 0x3FC3BF2820000000
+  %multiply.53799.7 = fmul float %multiply.1032.7, %multiply.18798.7
+  %add.57.i928.7 = fadd float %add.57.i928.6, %multiply.53799.7
+  %595 = getelementptr inbounds i8, ptr addrspace(1) %144, i64 3834891
+  %596 = load i8, ptr addrspace(1) %595, align 1, !invariant.load !142
+  %597 = sitofp i8 %596 to float
+  %multiply.18798.8 = fmul float %597, 0x3FC3BF2820000000
+  %multiply.53799.8 = fmul float %multiply.1032.8, %multiply.18798.8
+  %add.57.i928.8 = fadd float %add.57.i928.7, %multiply.53799.8
+  %598 = fptrunc float %add.57.i928.8 to half
+  %599 = getelementptr inbounds half, ptr addrspace(1) %100, i64 15
+  store half %598, ptr addrspace(1) %599, align 2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) }
+
+!140 = !{i32 0, i32 8658}
+!141 = !{i32 0, i32 64}
+!142 = !{}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/overlapping_chains.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/overlapping_chains.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/overlapping_chains.ll
@@ -0,0 +1,17 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+; CHECK-LABEL: @overlapping_stores
+; CHECK: store i16
+; CHECK: store i16
+; CHECK: store i16
+define void @overlapping_stores(ptr nocapture align 2 %ptr) { ; three i16 stores whose byte ranges overlap; the directives above expect them to stay scalar
+  %ptr0 = getelementptr i16, ptr %ptr, i64 0 ; byte offset 0
+  %ptr1 = getelementptr i8, ptr %ptr, i64 1 ; byte offset 1 (note the i8 element type)
+  %ptr2 = getelementptr i16, ptr %ptr, i64 1 ; byte offset 2
+
+  store i16 0, ptr %ptr0, align 2 ; writes bytes [0, 2)
+  store i16 0, ptr %ptr1, align 1 ; writes bytes [1, 3), overlapping both neighbouring stores
+  store i16 0, ptr %ptr2, align 2 ; writes bytes [2, 4)
+
+  ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i1.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i1.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i1.ll
@@ -0,0 +1,33 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+define void @i1x8(ptr nocapture align 4 %ptr) { ; four <8 x i1> loads at consecutive byte offsets 0..3 of a 4-byte-aligned pointer
+  %ptr0 = getelementptr i8, ptr %ptr, i64 0 ; byte offset 0
+  %ptr1 = getelementptr i8, ptr %ptr, i64 1 ; byte offset 1
+  %ptr2 = getelementptr i8, ptr %ptr, i64 2 ; byte offset 2
+  %ptr3 = getelementptr i8, ptr %ptr, i64 3 ; byte offset 3
+
+  %l0 = load <8 x i1>, ptr %ptr0, align 4 ; 8 bits each, so the four loads together span 32 bits
+  %l1 = load <8 x i1>, ptr %ptr1, align 1
+  %l2 = load <8 x i1>, ptr %ptr2, align 2
+  %l3 = load <8 x i1>, ptr %ptr3, align 1
+
+  ret void
+
+; CHECK-LABEL: @i1x8
+; CHECK-DAG: load <32 x i1>
+}
+
+define void @i1x8x16x8(ptr nocapture align 4 %ptr) { ; mixed-width i1 vector loads (8 + 16 + 8 bits) laid out back to back
+  %ptr0 = getelementptr i8, ptr %ptr, i64 0 ; byte offset 0
+  %ptr1 = getelementptr i8, ptr %ptr, i64 1 ; byte offset 1
+  %ptr2 = getelementptr i8, ptr %ptr, i64 3 ; byte offset 3, right after the 16-bit load
+
+  %l0 = load <8 x i1>,  ptr %ptr0, align 4 ; bits [0, 8)
+  %l2 = load <16 x i1>, ptr %ptr1, align 1 ; bits [8, 24)
+  %l3 = load <8 x i1>,  ptr %ptr2, align 1 ; bits [24, 32)
+
+  ret void
+
+; CHECK-LABEL: @i1x8x16x8
+; CHECK-DAG: load <32 x i1>
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i16.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i16.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i16.ll
@@ -0,0 +1,17 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+; CHECK-LABEL: @int16x2
+; CHECK: load <2 x i16>
+; CHECK: store <2 x i16>
+define void @int16x2(ptr nocapture align 4 %ptr) { ; swaps two adjacent i16 elements; the directives above expect one <2 x i16> load and one <2 x i16> store
+  %ptr0 = getelementptr i16, ptr %ptr, i64 0 ; element 0
+  %ptr1 = getelementptr i16, ptr %ptr, i64 1 ; element 1
+
+  %l0 = load i16, ptr %ptr0, align 4 ; read both elements first
+  %l1 = load i16, ptr %ptr1, align 2
+
+  store i16 %l1, ptr %ptr0, align 4 ; then write them back in swapped positions
+  store i16 %l0, ptr %ptr1, align 2
+
+  ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i24.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i24.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i24.ll
@@ -0,0 +1,21 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+; We don't need to vectorize this.  Just make sure it doesn't crash.
+
+; CHECK-LABEL: @int24x2
+; CHECK: load i24
+; CHECK: load i24
+; CHECK: store i24
+; CHECK: store i24
+define void @int24x2(ptr nocapture align 4 %ptr) { ; same swap pattern as an i16 pair but with i24; the directives above expect the scalar accesses to remain
+  %ptr0 = getelementptr i24, ptr %ptr, i64 0 ; first i24 element
+  %ptr1 = getelementptr i24, ptr %ptr, i64 1 ; second i24 element (byte distance depends on the i24 alloc size in the datalayout)
+
+  %l0 = load i24, ptr %ptr0, align 4 ; read both elements first
+  %l1 = load i24, ptr %ptr1, align 1
+
+  store i24 %l1, ptr %ptr0, align 4 ; then write them back in swapped positions
+  store i24 %l0, ptr %ptr1, align 1
+
+  ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
 
 ; Vectorize and emit valid code (Issue #54896).
@@ -41,8 +40,10 @@
   ret void
 
 ; CHECK-LABEL: @int8x3a4
-; CHECK: load <3 x i8>
-; CHECK: store <3 x i8>
+; CHECK: load <2 x i8>
+; CHECK: load i8
+; CHECK: store <2 x i8>
+; CHECK: store i8
 }
 
 define void @int8x12a4(ptr nocapture align 4 %ptr) {
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_vectors.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_vectors.ll
@@ -0,0 +1,17 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+; CHECK-LABEL: @int8x3Plus1
+; CHECK: load <4 x i8>
+; CHECK: store <4 x i8>
+define void @int8x3Plus1(ptr nocapture align 4 %ptr) { ; a <3 x i8> access followed by an adjacent scalar i8; the directives above expect <4 x i8> accesses
+  %ptr0 = getelementptr i8, ptr %ptr, i64 0 ; byte offset 0
+  %ptr3 = getelementptr i8, ptr %ptr, i64 3 ; byte offset 3, immediately after the three-byte vector
+
+  %l0 = load <3 x i8>, ptr %ptr0, align 4 ; bytes [0, 3)
+  %l1 = load i8, ptr %ptr3, align 1 ; byte 3
+
+  store <3 x i8> <i8 0, i8 0, i8 0>, ptr %ptr0, align 4 ; bytes [0, 3)
+  store i8 0, ptr %ptr3, align 1 ; byte 3
+
+  ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -7,12 +7,13 @@
 define void @correct_order(ptr noalias %ptr) {
 ; CHECK-LABEL: @correct_order(
 ; CHECK-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[NEXT_GEP1]], align 4
-; CHECK-NEXT:    [[L11:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[L42:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[L21:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[L12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[PTR]], align 4
-; CHECK-NEXT:    [[L3:%.*]] = load i32, ptr [[NEXT_GEP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[NEXT_GEP1]], align 4
+; CHECK-NEXT:    [[L33:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[L44:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
 ; CHECK-NEXT:    ret void
 ;
   %next.gep1 = getelementptr i32, ptr %ptr, i64 1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -8,9 +8,8 @@
 
 ; CHECK-LABEL: @interleave_2L_2S(
 ; CHECK: load <2 x i32>
-; CHECK: load i32
 ; CHECK: store <2 x i32>
-; CHECK: load i32
+; CHECK: load <2 x i32>
 define void @interleave_2L_2S(ptr noalias %ptr) {
   %next.gep1 = getelementptr i32, ptr %ptr, i64 1
   %next.gep2 = getelementptr i32, ptr %ptr, i64 2
@@ -26,9 +25,9 @@
 }
 
 ; CHECK-LABEL: @interleave_3L_2S_1L(
-; CHECK: load <3 x i32>
+; CHECK: load <2 x i32>
 ; CHECK: store <2 x i32>
-; CHECK: load i32
+; CHECK: load <2 x i32>
 
 define void @interleave_3L_2S_1L(ptr noalias %ptr) {
   %next.gep1 = getelementptr i32, ptr %ptr, i64 1
@@ -82,15 +81,10 @@
   ret void
 }
 
-; FIXME: If the chain is too long and TLI says misaligned is not fast,
-; then LSV fails to vectorize anything in that chain.
-; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7.
-
 ; CHECK-LABEL: @interleave_get_longest
-; CHECK: load <3 x i32>
-; CHECK: load i32
+; CHECK: load <2 x i32>
 ; CHECK: store <2 x i32> zeroinitializer
-; CHECK: load i32
+; CHECK: load <3 x i32>
 ; CHECK: load i32
 ; CHECK: load i32
 
@@ -98,6 +92,7 @@
   %tmp2 = getelementptr i32, ptr %ptr, i64 1
   %tmp3 = getelementptr i32, ptr %ptr, i64 2
   %tmp4 = getelementptr i32, ptr %ptr, i64 3
+  %tmp5 = getelementptr i32, ptr %ptr, i64 4
 
   %l1 = load i32, ptr %tmp2, align 4
   %l2 = load i32, ptr %ptr, align 4
@@ -106,8 +101,32 @@
   %l3 = load i32, ptr %tmp2, align 4
   %l4 = load i32, ptr %tmp3, align 4
   %l5 = load i32, ptr %tmp4, align 4
-  %l6 = load i32, ptr %tmp4, align 4
-  %l7 = load i32, ptr %tmp4, align 4
+  %l6 = load i32, ptr %tmp5, align 4
+  %l7 = load i32, ptr %tmp5, align 4
 
   ret void
 }
+
+; CHECK-LABEL: @interleave_get_longest_aligned
+; CHECK: load <2 x i32>
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: load <4 x i32>
+
+define void @interleave_get_longest_aligned(ptr noalias %ptr) { ; variant of @interleave_get_longest with higher alignment on %l3 and %l5
+  %tmp2 = getelementptr i32, ptr %ptr, i64 1 ; element 1
+  %tmp3 = getelementptr i32, ptr %ptr, i64 2 ; element 2
+  %tmp4 = getelementptr i32, ptr %ptr, i64 3 ; element 3
+  %tmp5 = getelementptr i32, ptr %ptr, i64 4 ; element 4
+
+  %l1 = load i32, ptr %tmp2, align 4 ; first load group: elements 1 and 0, issued before the stores
+  %l2 = load i32, ptr %ptr, align 4
+  store i32 0, ptr %tmp2, align 4 ; stores to elements 1 and 0 sit between the two load groups
+  store i32 0, ptr %ptr, align 4
+  %l3 = load i32, ptr %tmp2, align 16 ; second load group: elements 1..4, starting at a 16-byte-aligned access
+  %l4 = load i32, ptr %tmp3, align 4
+  %l5 = load i32, ptr %tmp4, align 8
+  %l6 = load i32, ptr %tmp5, align 4
+  %l7 = load i32, ptr %tmp5, align 4 ; repeats the element-4 load
+
+  ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
@@ -4,8 +4,7 @@
 ; Check that the LoadStoreVectorizer does not crash due to not differentiating <1 x T> and T.
 
 ; CHECK-LABEL: @vector_scalar(
-; CHECK: store double
-; CHECK: store <1 x double>
+; CHECK: store <2 x double>
 define void @vector_scalar(ptr %ptr, double %a, <1 x double> %b) {
   %1 = getelementptr <1 x double>, ptr %ptr, i32 1
   store double %a, ptr %ptr, align 8
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll
@@ -55,53 +55,6 @@
   ret void
 }
 
-define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
-; CHECK-LABEL: @ld_v4i8_add_nuw(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> poison, i8 [[TMP41]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT:    ret void
-;
-bb:
-  %tmp = add nuw i32 %v0, -1
-  %tmp1 = add nuw i32 %v1, %tmp
-  %tmp2 = zext i32 %tmp1 to i64
-  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
-  %tmp4 = load i8, ptr %tmp3, align 1
-  %tmp5 = add nuw i32 %v1, %v0
-  %tmp6 = zext i32 %tmp5 to i64
-  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
-  %tmp8 = load i8, ptr %tmp7, align 1
-  %tmp9 = add nuw i32 %v0, 1
-  %tmp10 = add nuw i32 %v1, %tmp9
-  %tmp11 = zext i32 %tmp10 to i64
-  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
-  %tmp13 = load i8, ptr %tmp12, align 1
-  %tmp14 = add nuw i32 %v0, 2
-  %tmp15 = add nuw i32 %v1, %tmp14
-  %tmp16 = zext i32 %tmp15 to i64
-  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
-  %tmp18 = load i8, ptr %tmp17, align 1
-  %tmp19 = insertelement <4 x i8> poison, i8 %tmp4, i32 0
-  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
-  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
-  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
-  store <4 x i8> %tmp22, ptr %dst
-  ret void
-}
-
 ; Make sure we don't vectorize the loads below because the source of
 ; sext instructions doesn't have the nsw flag.
 
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll
@@ -55,53 +55,6 @@
   ret void
 }
 
-define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
-; CHECK-LABEL: @ld_v4i8_add_nuw(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
-; CHECK-NEXT:    ret void
-;
-bb:
-  %tmp = add nuw i32 %v0, -1
-  %tmp1 = add nuw i32 %v1, %tmp
-  %tmp2 = zext i32 %tmp1 to i64
-  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
-  %tmp4 = load i8, ptr %tmp3, align 1
-  %tmp5 = add nuw i32 %v1, %v0
-  %tmp6 = zext i32 %tmp5 to i64
-  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
-  %tmp8 = load i8, ptr %tmp7, align 1
-  %tmp9 = add nuw i32 %v0, 1
-  %tmp10 = add nuw i32 %v1, %tmp9
-  %tmp11 = zext i32 %tmp10 to i64
-  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
-  %tmp13 = load i8, ptr %tmp12, align 1
-  %tmp14 = add nuw i32 %v0, 2
-  %tmp15 = add nuw i32 %v1, %tmp14
-  %tmp16 = zext i32 %tmp15 to i64
-  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
-  %tmp18 = load i8, ptr %tmp17, align 1
-  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
-  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
-  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
-  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
-  store <4 x i8> %tmp22, ptr %dst
-  ret void
-}
-
 ; Apply different operand orders for the nested add sequences
 define void @ld_v4i8_add_nsw_operand_orders(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
 ; CHECK-LABEL: @ld_v4i8_add_nsw_operand_orders(
@@ -150,54 +103,6 @@
   ret void
 }
 
-; Apply different operand orders for the nested add sequences
-define void @ld_v4i8_add_nuw_operand_orders(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
-; CHECK-LABEL: @ld_v4i8_add_nuw_operand_orders(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
-; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
-; CHECK-NEXT:    ret void
-;
-bb:
-  %tmp = add nuw i32 %v0, -1
-  %tmp1 = add nuw i32 %v1, %tmp
-  %tmp2 = zext i32 %tmp1 to i64
-  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
-  %tmp4 = load i8, ptr %tmp3, align 1
-  %tmp5 = add nuw i32 %v0, %v1
-  %tmp6 = zext i32 %tmp5 to i64
-  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
-  %tmp8 = load i8, ptr %tmp7, align 1
-  %tmp9 = add nuw i32 %v0, 1
-  %tmp10 = add nuw i32 %tmp9, %v1
-  %tmp11 = zext i32 %tmp10 to i64
-  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
-  %tmp13 = load i8, ptr %tmp12, align 1
-  %tmp14 = add nuw i32 %v0, 2
-  %tmp15 = add nuw i32 %v1, %tmp14
-  %tmp16 = zext i32 %tmp15 to i64
-  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
-  %tmp18 = load i8, ptr %tmp17, align 1
-  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
-  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
-  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
-  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
-  store <4 x i8> %tmp22, ptr %dst
-  ret void
-}
-
 define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
 ; CHECK-LABEL: @ld_v4i8_add_known_bits(
 ; CHECK-NEXT:  bb:
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll b/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
--- a/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
@@ -78,9 +78,9 @@
 ; CHECK-NEXT:    [[P2:%.*]] = getelementptr float, ptr [[P]], i64 2
 ; CHECK-NEXT:    [[P3:%.*]] = getelementptr float, ptr [[P]], i64 3
 ; CHECK-NEXT:    [[L0:%.*]] = load float, ptr [[P]], align 16
+; CHECK-NEXT:    call void @foo() #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    [[L1:%.*]] = load float, ptr [[P1]], align 4
 ; CHECK-NEXT:    [[L2:%.*]] = load float, ptr [[P2]], align 4
-; CHECK-NEXT:    call void @foo() #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    [[L3:%.*]] = load float, ptr [[P3]], align 4
 ; CHECK-NEXT:    store float [[L0]], ptr [[P]], align 16
 ; CHECK-NEXT:    call void @foo() #[[ATTR2]]
@@ -93,9 +93,9 @@
   %p2 = getelementptr float, ptr %p, i64 2
   %p3 = getelementptr float, ptr %p, i64 3
   %l0 = load float, ptr %p, align 16
+  call void @foo() inaccessiblememonly nounwind
   %l1 = load float, ptr %p1
   %l2 = load float, ptr %p2
-  call void @foo() inaccessiblememonly nounwind
   %l3 = load float, ptr %p3
   store float %l0, ptr %p, align 16
   call void @foo() inaccessiblememonly nounwind
@@ -111,9 +111,9 @@
 ; CHECK-NEXT:    [[P2:%.*]] = getelementptr float, ptr [[P]], i64 2
 ; CHECK-NEXT:    [[P3:%.*]] = getelementptr float, ptr [[P]], i64 3
 ; CHECK-NEXT:    [[L0:%.*]] = load float, ptr [[P]], align 16
+; CHECK-NEXT:    call void @foo() #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    [[L1:%.*]] = load float, ptr [[P1]], align 4
 ; CHECK-NEXT:    [[L2:%.*]] = load float, ptr [[P2]], align 4
-; CHECK-NEXT:    call void @foo() #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    [[L3:%.*]] = load float, ptr [[P3]], align 4
 ; CHECK-NEXT:    store float [[L0]], ptr [[P]], align 16
 ; CHECK-NEXT:    call void @foo() #[[ATTR3]]
@@ -126,9 +126,9 @@
   %p2 = getelementptr float, ptr %p, i64 2
   %p3 = getelementptr float, ptr %p, i64 3
   %l0 = load float, ptr %p, align 16
+  call void @foo() inaccessiblememonly willreturn
   %l1 = load float, ptr %p1
   %l2 = load float, ptr %p2
-  call void @foo() inaccessiblememonly willreturn
   %l3 = load float, ptr %p3
   store float %l0, ptr %p, align 16
   call void @foo() inaccessiblememonly willreturn