Index: llvm/lib/Target/ARC/ARC.h =================================================================== --- llvm/lib/Target/ARC/ARC.h +++ llvm/lib/Target/ARC/ARC.h @@ -19,13 +19,14 @@ namespace llvm { -class FunctionPass; +class ARCSubtarget; class ARCTargetMachine; +class FunctionPass; FunctionPass *createARCISelDag(ARCTargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createARCExpandPseudosPass(); -FunctionPass *createARCOptAddrMode(); +FunctionPass *createARCOptAddrMode(const ARCSubtarget &); FunctionPass *createARCBranchFinalizePass(); } // end namespace llvm Index: llvm/lib/Target/ARC/ARCInstrInfo.td =================================================================== --- llvm/lib/Target/ARC/ARCInstrInfo.td +++ llvm/lib/Target/ARC/ARCInstrInfo.td @@ -133,6 +133,9 @@ "STB_FAR $dst, $addr", [(truncstorei8 GPR32:$dst, AddrModeFar:$addr)]>; +// To be deleted opcode +def TBD : PseudoInstARC<(outs),(ins),"TO_BE_DELETED",[]>; + // TODO: Add `Requires<[HasBitScan]>` predicate to these when available. let Defs = [STATUS32] in { def CTLZ : PseudoInstARC<(outs GPR32:$A), @@ -289,6 +292,9 @@ // Definitions for 3 operand binary instructions. defm ADD : ArcBinaryGEN4Inst<0b000000, "add",1>; +defm ADD1 : ArcBinaryGEN4Inst<0b010100, "add1">; +defm ADD2 : ArcBinaryGEN4Inst<0b010101, "add2">; +defm ADD3 : ArcBinaryGEN4Inst<0b010110, "add3">; defm SUB : ArcBinaryGEN4Inst<0b000010, "sub">; defm SUB1 : ArcBinaryGEN4Inst<0b010111, "sub1">; defm SUB2 : ArcBinaryGEN4Inst<0b011000, "sub2">; Index: llvm/lib/Target/ARC/ARCOptAddrMode.cpp =================================================================== --- llvm/lib/Target/ARC/ARCOptAddrMode.cpp +++ llvm/lib/Target/ARC/ARCOptAddrMode.cpp @@ -15,11 +15,14 @@ #define GET_INSTRMAP_INFO #include "ARCInstrInfo.h" #include "ARCTargetMachine.h" +#include "ARCUtil.h" +#include "MCTargetDesc/ARCMCUtil.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/InitializePasses.h" @@ -34,6 +37,13 @@ #define DEBUG_TYPE "arc-addr-mode" namespace llvm { +#if 0 +} fix emacs; +#endif + +// Compute best -Os/-Os1 settings +static cl::opt ArcAbawMaxSpace("arc-abaw-max-space", cl::init(16), + cl::ReallyHidden, cl::ZeroOrMore); static cl::opt ArcKillAddrMode("arc-kill-addr-mode", cl::init(0), cl::ReallyHidden, cl::ZeroOrMore); @@ -43,9 +53,257 @@ #define VIEW_BEFORE() ((ArcKillAddrMode & 0x0004) != 0) #define VIEW_AFTER() ((ArcKillAddrMode & 0x0008) != 0) #define KILL_PASS() ((ArcKillAddrMode & 0x0010) != 0) +#define SINK_LDST() ((ArcKillAddrMode & 0x00000020) != 0) +#define GEN_PSEUDO_INC() ((ArcKillAddrMode & 0x00000040) != 0) +#define KILL_VDSP_VVLD() ((ArcKillAddrMode & 0x00000080) != 0) -FunctionPass *createARCOptAddrMode(); void initializeARCOptAddrModePass(PassRegistry &); + +// TODO port +#define mayHaveShortForm(x) false + +namespace ARC { +#if 0 +} fix emacs; +#endif + +// TODO +bool isVectorInstr(long long) { return false; } + +// Structure to represent register increment +// Reg - increment value +// Scale - scale, as present in inctructions like ADD2, LD_rras etc +struct RegIncrement { + unsigned Reg; + unsigned Scale; +}; + +// Structure to represent Increment/Offset of instruction. 
Can be immediate or +// (scaled) register +struct BaseIncr { + bool IsImm = false; + union { + RegIncrement RI; + int64_t LI = 0; + } u; + BaseIncr() : IsImm(false) { u.LI = 0; } + BaseIncr(int64_t ImmOff) : IsImm(true) { u.LI = ImmOff; } + BaseIncr(unsigned Reg, unsigned Scale) : IsImm(false) { u.RI = {Reg, Scale}; } + + BaseIncr(const BaseIncr &other) { + IsImm = other.IsImm; + u = other.u; + } + + bool operator==(const BaseIncr &other) { + if (IsImm != other.IsImm) + return false; + if (IsImm) + return u.LI == other.u.LI; + else + return (u.RI.Reg = other.u.RI.Reg && u.RI.Scale == other.u.RI.Scale); + } + bool operator!=(const BaseIncr &other) { return !operator==(other); } + + int64_t getImm() const { + assert(IsImm && "not an immediate"); + return u.LI; + } + unsigned getReg() const { + assert(!IsImm && "not a reg incr"); + return u.RI.Reg; + } + unsigned getScale() const { + assert(!IsImm && "not a reg incr"); + return u.RI.Scale; + } + + void setImm(int64_t Imm) { + assert(IsImm && "not an immediate"); + u.LI = Imm; + } + void setReg(unsigned Reg) { + assert(!IsImm && "not a reg incr"); + u.RI.Reg = Reg; + } + void setScale(unsigned Scale) { + assert(!IsImm && "not a reg incr"); + u.RI.Scale = Scale; + } + + RegIncrement &getRegIncrement() { return u.RI; } + + void print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { + OS << "Offset: "; + if (IsImm) + OS << getImm(); + else + OS << printReg(getReg(), TRI) << " * " << getScale(); + } +}; + +class ABAW : public ARC::SsaInstructionVisitor { + + typedef SmallVectorImpl InstrVector; + typedef std::pair InstIncrPair; + + MachineDominatorTree &DOM; + const MachineLoopInfo &MLI; + bool IsOptimizeForSpace = false; + bool InSWPCandidate = false; + + // Map of tied registers with non-immediate increments + // E.g., generating + // %vreg104, %vreg100 = LDD_rr_ab %vreg99, %vreg32 + // will add to this map: + // tiedRegisterMap[%vreg99] = {%vreg100, %vreg32, LDD_rr_ab}; + // We use it to perform kind of copy propagation for tied registers. + // See STAR 9001160762 for example where it could be useful + struct TiedRegIncrement { + unsigned newReg; + unsigned incReg; + MachineInstr *def; + }; + std::map tiedRegisterMap; + + // Instruction ordinals within BB. + // Used for quick lexical ordering of MIs within BB to avoid + // quadratic behaviour of dominates() + std::map Ordinals; + + // Vectors of candidate load/stores and increments + SmallVector Candidates, Increments; // TODO: MapVector maybe? + + // Vector of uses of base register outside current BB in a block dominated by + // BB + // TODO: they really can be made InstIncrPair. If we cannot recognize it, we + // cannot fix it anyway + SmallVector ExternalDominatedUses; + + // Vector of uses of base register outside current BB in a block that + // dominates BB + SmallVector ExternalDominatingUses; + + // If true, generate preincrement form instead of postincrement + bool GeneratePreInc = false; + + MachineInstr *CurrentInst = nullptr; + +public: + ABAW(const ARCSubtarget &ST, MachineFunction &MF, MachineDominatorTree &DOM, + const MachineLoopInfo &MLI) + : SsaInstructionVisitor(ST, MF), DOM(DOM), MLI(MLI) { + IsOptimizeForSpace = ARC::isOptimizeForSpace(MF); + } + + bool visit(MachineInstr &) override; + void preBlockCallout(MachineBasicBlock &) override; + void postBlockCallout(MachineBasicBlock &, bool) override; + +private: + // Check if either one instruction can be moved (up or down) to another. 
+ // Returns instruction which another can be moved to (i.e., one not moved) or + // nullptr if nothing can be moved + MachineInstr *canJoinInstructions(MachineInstr *ldst, MachineInstr *add); + + // Checks that there are no uses of add in interval (add, ldst) + bool noUseOfAddBeforeLoadOrStore(MachineInstr *add, MachineInstr *ldst); + + // Update instruction operands accorging to pre/post-increment form. + // MI - instruction to update + // NewBaseReg - register for new base + // BaseReg - base register + // NewOffset - offset + void setLoadStoreBaseOffset(MachineInstr &MI, unsigned NewBaseReg, + unsigned BaseReg, + const MachineOperand &NewOffset); + + void setLoadStoreBaseOffset(MachineInstr &MI, unsigned NewBaseReg, + unsigned BaseReg, int64_t NewOffset); + + // Is "MI" the result of adding a constant or register to itself? + // If so, make the first source operand reference the PHI + bool isSelfIncrementing(MachineInstr &MI) const; + + // Check if load/store instruction 'Ldst' can be hoisted up to instruction + // 'To' + bool canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To); + + // Check if load/store instruction 'Ldst' can be sunk down to instruction 'To' + bool canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To); + + // Return true if all instructions in 'Uses' can be updated to accomodate + // BaseReg's increment by instruction Incr + bool canFixPastUses(const InstrVector &Uses, MachineInstr *Incr, + unsigned BaseReg); + + // Adjust all uses of 'base' after 'ldst' to accomodate base increment by + // newOffset + void fixIntermediates(MachineInstr *ldst, unsigned newBase, int64_t newOffset, + unsigned oldBase); + + // As above, but increment is not a literal but register 'Addend' scaled by + // 'Scale' + void fixIntermediatesReg(MachineInstr *ldst, unsigned newBase, + unsigned Addend, unsigned Scale, unsigned Base); + + // Try to combine load/store instruction 'Ldst' with base register increment + bool tryToCombine(MachineInstr &Ldst); + + bool hasOnePhiUse(const MachineOperand &opd) const; + bool transformPhiForBetterLICM(MachineInstr *Phi, MachineInstr *Ldst, + unsigned BaseIdx, unsigned OffIdx); + + void clear(); + + // Collect instruction in MBB using BaseReg as a base register + // Return true is something has been found + bool collectCandidates(unsigned BaseReg, MachineBasicBlock *MBB); + + // Check candidates to see if address postincrement can be applied + bool analyzeCandidates(unsigned BaseReg, MachineBasicBlock *MBB); + + // Analysis specific for instructions with immediate offsets + bool analyzeCandidatesImm(unsigned BaseReg, MachineBasicBlock *MBB); + + // Analysis specific for instructions with register offsets + bool analyzeCandidatesReg(unsigned BaseReg, MachineBasicBlock *MBB); + + // Verify all "Candidates" have increment forms + bool validateIncrementForms() const; + + // Apply transformation to eligible instructions + bool transformCandidates(unsigned BaseReg, MachineBasicBlock *MBB); + + // In case base address if loop IV, try to transform is so that + // first load/store instruction has zero offset + bool tryToTransformPHI(unsigned BaseReg, MachineBasicBlock *MBB); + + // Try to reorder loads/stores so that their offsets are more + // amenable to transformation + bool tryToReorderCandidatesImm(); + + // See if address preincrement can be generated instead of postincrement + // Somtimes it can be simpler transformation + bool tryToGenPreIncImm(); + + // Check if candidate instructions can be changed to postincrement form + bool 
checkCandidatesImm(); + + unsigned getConversionToAW(MachineInstr &MI) const; + unsigned getConversionToAB(MachineInstr &MI) const; + unsigned getVDSPAVVariant(unsigned ABOpcode) const; + + void computeOrdinals(const MachineBasicBlock &MBB) { + Ordinals.clear(); + unsigned N = 1; + for (auto &I : MBB) + Ordinals[&I] = N++; + } +}; + +} // end namespace ARC + } // end namespace llvm namespace { @@ -53,13 +311,19 @@ public: static char ID; + const ARCSubtarget *ST; + ARCOptAddrMode() : MachineFunctionPass(ID) {} + ARCOptAddrMode(const ARCSubtarget &ST) : MachineFunctionPass(ID), ST(&ST) {} + StringRef getPassName() const override { return OPTADDRMODE_DESC; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); } @@ -67,51 +331,6 @@ bool runOnMachineFunction(MachineFunction &MF) override; private: - const ARCSubtarget *AST = nullptr; - const ARCInstrInfo *AII = nullptr; - MachineRegisterInfo *MRI = nullptr; - MachineDominatorTree *MDT = nullptr; - - // Tries to combine \p Ldst with increment of its base register to form - // single post-increment instruction. - MachineInstr *tryToCombine(MachineInstr &Ldst); - - // Returns true if result of \p Add is not used before \p Ldst - bool noUseOfAddBeforeLoadOrStore(const MachineInstr *Add, - const MachineInstr *Ldst); - - // Returns true if load/store instruction \p Ldst can be hoisted up to - // instruction \p To - bool canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To); - - // // Returns true if load/store instruction \p Ldst can be sunk down - // // to instruction \p To - // bool canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To); - - // Check if instructions \p Ldst and \p Add can be moved to become adjacent - // If they can return instruction which need not to move. - // If \p Uses is not null, fill it with instructions after \p Ldst which use - // \p Ldst's base register - MachineInstr *canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add, - SmallVectorImpl *Uses); - - // Returns true if all instruction in \p Uses array can be adjusted - // to accomodate increment of register \p BaseReg by \p Incr - bool canFixPastUses(const ArrayRef &Uses, - MachineOperand &Incr, unsigned BaseReg); - - // Update all instructions in \p Uses to accomodate increment - // of \p BaseReg by \p Offset - void fixPastUses(ArrayRef Uses, unsigned BaseReg, - int64_t Offset); - - // Change instruction \p Ldst to postincrement form. 
- // \p NewBase is register to hold update base value - // \p NewOffset is instruction's new offset - void changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode, - unsigned NewBase, MachineOperand &NewOffset); - - bool processBasicBlock(MachineBasicBlock &MBB); }; } // end anonymous namespace @@ -119,285 +338,1002 @@ char ARCOptAddrMode::ID = 0; INITIALIZE_PASS_BEGIN(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false, false) -// Return true if \p Off can be used as immediate offset -// operand of load/store instruction (S9 literal) -static bool isValidLoadStoreOffset(int64_t Off) { return isInt<9>(Off); } +bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction()) || KILL_PASS()) + return false; + + if (DUMP_BEFORE()) + MF.dump(); + if (VIEW_BEFORE()) + MF.viewCFG(); -// Return true if \p Off can be used as immediate operand of -// ADD/SUB instruction (U6 literal) -static bool isValidIncrementOffset(int64_t Off) { return isUInt<6>(Off); } + LLVM_DEBUG(dbgs() << ">>>Starting ARCOptAddrMode pass for " + << MF.getFunction().getName() << "\n"); + bool Changed = ARC::ABAW(*ST, MF, getAnalysis(), + getAnalysis()) + .examineEachInstr(); + LLVM_DEBUG(dbgs() << ">>>Ended ARCOptAddrMode pass for " + << MF.getFunction().getName() << " with result " << Changed + << "\n"); -static bool isAddConstantOp(const MachineInstr &MI, int64_t &Amount) { - int64_t Sign = 1; - switch (MI.getOpcode()) { - case ARC::SUB_rru6: - Sign = -1; - LLVM_FALLTHROUGH; - case ARC::ADD_rru6: - assert(MI.getOperand(2).isImm() && "Expected immediate operand"); - Amount = Sign * MI.getOperand(2).getImm(); - return true; - default: - return false; + if (DUMP_AFTER()) + MF.dump(); + if (VIEW_AFTER()) + MF.viewCFG(); + return Changed; +} + +// Returns TRUE if opd has exactly one use by a PHI instruction +bool ARC::ABAW::hasOnePhiUse(const MachineOperand &opd) const { + int count = 0; + for (MachineInstr &I : MRI.use_nodbg_instructions(opd.getReg())) { + if (I.getOpcode() != ARC::PHI) + return false; + ++count; } + return count == 1; } -// Return true if \p MI dominates of uses of virtual register \p VReg -static bool dominatesAllUsesOf(const MachineInstr *MI, unsigned VReg, - MachineDominatorTree *MDT, - MachineRegisterInfo *MRI) { +// Try to change +// BB#1: +// %r0 = ... +// +// BB#2: +// %r1 = PHI %r0, , %r2, +// %r3 = LD_rs9 %r1, off +// %r2 = ADD_rru6 %r1, inc +// +// to +// +// BB#1: +// %r0 = ... 
+// %r4 = %r0 + off +// +// BB#2: +// %r1 = PHI %r4, , %r2, +// %r3 = LD_rs9 %r1, 0 +// %r2 = ADD_rru6 %r1, inc +// +bool ARC::ABAW::tryToTransformPHI(unsigned BaseReg, MachineBasicBlock *MBB) { + // See if we can change first load/store to have zero offset + LLVM_DEBUG(dbgs() << "\toffset of first mem instr is non-zero - check if it " + "can be zeroed\n"); + + if (!ExternalDominatedUses.empty()) { + return false; // TODO: it may be possible to adjust them + } - assert(Register::isVirtualRegister(VReg) && "Expected virtual register!"); + // Our base address must be simple loop local variable (not leaking outside) + MachineInstr *PHI = getSsaDef(BaseReg); + if (!PHI || PHI->getParent() != MBB || !PHI->isPHI()) { + LLVM_DEBUG( + dbgs() << "\tbase reg def is not PHI or not in single-block loop\n"); + return false; + } - for (auto it = MRI->use_nodbg_begin(VReg), end = MRI->use_nodbg_end(); - it != end; ++it) { - MachineInstr *User = it->getParent(); - if (User->isPHI()) { - unsigned BBOperandIdx = User->getOperandNo(&*it) + 1; - MachineBasicBlock *MBB = User->getOperand(BBOperandIdx).getMBB(); - if (MBB->empty()) { - const MachineBasicBlock *InstBB = MI->getParent(); - assert(InstBB != MBB && "Instruction found in empty MBB"); - if (!MDT->dominates(InstBB, MBB)) - return false; + if (PHI->getNumOperands() != 5) { + LLVM_DEBUG(dbgs() << "\twrong number of PHI's operands\n"); + return false; + } + + MachineInstr *Incr = Increments[0].first; + if (!hasOnePhiUse(Incr->getOperand(0))) { + LLVM_DEBUG(dbgs() << "\tincrement has more than single PHI use\n"); + return false; + } + if (Incr != getSsaDef(PHI->getOperand(3).getReg())) { + return false; + } + + unsigned BaseIdx = (Candidates[0].first->mayLoad()) ? 1 : 0; + unsigned OffIdx = BaseIdx + 1; + + if (!Candidates[0].second.IsImm) { + // TODO: fix transformPhiForBetterLICM to handle multiple insns + if (Candidates.size() > 1) + return false; + + bool IncrIsImm = Increments[0].second.IsImm; + for (auto &C : Candidates) { + if (C.first == Incr) continue; - } - User = &*MBB->rbegin(); + if (C.second.IsImm) + return false; + if (IncrIsImm && (C.second.getScale() - 1) != 0) + return false; // No point doing this transformation + } + bool baseOffsetSwapped = false; + if (ARC::getVReg(Candidates[0].first->getOperand(BaseIdx)) != BaseReg) { + assert(ARC::getVReg(Candidates[0].first->getOperand(OffIdx)) == BaseReg); + std::swap(BaseIdx, OffIdx); + baseOffsetSwapped = true; + } + + // Def of offset must dominate incoming block + MachineBasicBlock *IncomingMBB = PHI->getOperand(2).getMBB(); + MachineInstr *OffDef = getSsaDef(Candidates[0].first->getOperand(OffIdx)); + if (OffDef == nullptr) + return false; + if (!DOM.dominates(OffDef->getParent(), IncomingMBB)) { + return false; } - if (!MDT->dominates(MI, User)) + if (!transformPhiForBetterLICM(PHI, Candidates[0].first, BaseIdx, OffIdx)) return false; + + if (baseOffsetSwapped) { + // Avoid nonsense like "ldub %r0,[0,%1]" + + MachineInstr *Ldst = Candidates[0].first; + if (OffIdx + 1 == BaseIdx) { // Should always be true + MachineOperand base = Ldst->getOperand(BaseIdx); + MachineOperand off = Ldst->getOperand(OffIdx); + if (base.isReg() && off.isImm()) { + Ldst->getOperand(OffIdx).ChangeToRegister(base.getReg(), false); + Ldst->getOperand(OffIdx).setSubReg(base.getSubReg()); + Ldst->getOperand(BaseIdx).ChangeToImmediate(off.getImm()); + } + } + } + + for (auto &C : Candidates) { + if (C.first == Incr) + continue; + unsigned NewScale = C.second.getScale() - 1; + if (NewScale) { + 
C.second.setScale(NewScale); + } else { + C.second.IsImm = true; + C.second.setImm(0); + } + } + LLVM_DEBUG(dbgs() << "Transformed Candidates:\n"; for (auto &C + : Candidates) { + dbgs() << Ordinals[C.first] << ": " << *C.first << "\t"; + C.second.print(dbgs(), TRI); + dbgs() << "\n"; + }); + return true; } + + // Immediate offset case + + int64_t PrevOffset = Candidates[0].second.getImm(); + + if (!transformPhiForBetterLICM(PHI, Candidates[0].first, BaseIdx, OffIdx)) + return false; + + for (auto &C : Candidates) { + if (C.first == Incr) + continue; + int64_t NewOffset = C.second.getImm() - PrevOffset; + C.second.setImm(NewOffset); + } + LLVM_DEBUG(dbgs() << "Transformed Candidates:\n"; for (auto &C + : Candidates) { + dbgs() << Ordinals[C.first] << ": " << *C.first << "\t"; + C.second.print(dbgs(), TRI); + dbgs() << "\n"; + }); return true; } -// Return true if \p MI is load/store instruction with immediate offset -// which can be adjusted by \p Disp -static bool isLoadStoreThatCanHandleDisplacement(const TargetInstrInfo *TII, - const MachineInstr &MI, - int64_t Disp) { - unsigned BasePos, OffPos; - if (!TII->getBaseAndOffsetPosition(MI, BasePos, OffPos)) - return false; - const MachineOperand &MO = MI.getOperand(OffPos); - if (!MO.isImm()) +// (STAR 9000862576) ISel scheduler loves to swap loads from same BaseReg and +// put one with bigger offset first: +// %vreg17 = LD_rs9 %vreg181, 68; +// %vreg20 = LD_rs9 %vreg181, 4; +// %vreg27 = LD_rs9 %vreg181, 260; +// %vreg50 = LD_rs9 %vreg181, 324; +// ... +// We can generate postincrement only if two first loads may be swapped +// It usually happens with first two loads, hence this simple ad hoc +// implementation + +bool ARC::ABAW::tryToReorderCandidatesImm() { + bool Changed = false; + bool Ascending = (Increments[0].second.getImm() > 0); + size_t N = Candidates.size(); + MachineBasicBlock *MBB = Candidates[0].first->getParent(); + // We don't want to move increments, hence i < N-1 bound. + for (size_t i = 1; i < N - 1; ++i) { + MachineInstr *PrevMI = Candidates[i - 1].first; + MachineInstr *CurrMI = Candidates[i].first; + assert(PrevMI->getParent() == MBB); + assert(CurrMI->getParent() == MBB); + int64_t PrevOff = Candidates[i - 1].second.getImm(); + int64_t CurrOff = Candidates[i].second.getImm(); + if ((Ascending && (CurrOff >= PrevOff)) || + (!Ascending && (CurrOff <= PrevOff))) + continue; + LLVM_DEBUG(dbgs() << "Found misplaced instructions in %bb." 
+ << MBB->getNumber() << ":\n" + << *PrevMI << *CurrMI); + MachineBasicBlock::iterator it1(CurrMI); + ++it1; + MachineBasicBlock::iterator it2(PrevMI); + if (it1 != MBB->end() && canSinkLoadStoreTo(PrevMI, &*it1)) { + if (CurrentInst && CurrentInst->getNextNode() == PrevMI) { + LLVM_DEBUG( + dbgs() << "Cannot move instruction - would break iterators\n"); + return false; + } + if (PrevMI != &*it1) { + PrevMI->removeFromParent(); + MBB->insert(it1, PrevMI); + std::swap(Candidates[i], Candidates[i - 1]); + Changed = true; + } + } else if (it2 != MBB->begin() && canHoistLoadStoreTo(CurrMI, &*(--it2))) { + if (CurrentInst && CurrentInst->getNextNode() == CurrMI) { + LLVM_DEBUG( + dbgs() << "Cannot move instruction - would break iterators\n"); + return false; + } + CurrMI->removeFromParent(); + MBB->insertAfter(it2, CurrMI); + std::swap(Candidates[i], Candidates[i - 1]); + Changed = true; + } + } + + LLVM_DEBUG(if (Changed) { + dbgs() << "Reordered Candidates:\n"; + for (auto &C : Candidates) { + dbgs() << Ordinals[C.first] << ": " << *C.first << "\t"; + C.second.print(dbgs(), TRI); + dbgs() << "\n"; + } + }); + + return Changed; +} + +// Change +// %r3 = LD_rs9 %r1, off +// %r2 = ADD_rru6 %r1, off +// +// to +// %r3,%r2 = LD_rs9_aw, %r1, off +// +bool ARC::ABAW::tryToGenPreIncImm() { + size_t N = Candidates.size() - 1; + InstIncrPair &I = Increments[0]; + GeneratePreInc = false; + + if (!isInt<9>(I.second.getImm())) return false; - int64_t Offset = MO.getImm() + Disp; - return isValidLoadStoreOffset(Offset); + + // TODO: run this loop backwards? We can have several eligible instrs, + // see v2_xy/instcombine/valgn_create.c + for (unsigned Pos = 0; Pos < N; ++Pos) { + InstIncrPair &C = Candidates[Pos]; + if (getConversionToAW(*C.first) == 0) + continue; + + if (C.second.getImm() == 0 || C.second.getImm() != I.second.getImm()) + continue; + + if (canJoinInstructions(C.first, I.first) != C.first) + continue; + + LLVM_DEBUG(dbgs() << "\tcan generate preincrement for " + << *Candidates[Pos].first); + GeneratePreInc = true; + for (unsigned i = Pos + 1; i < N; ++i) { + ExternalDominatedUses.push_back(Candidates[i].first); + } + SmallVector Tmp(1, C); + Candidates.swap(Tmp); + return true; + } + return false; } -bool ARCOptAddrMode::noUseOfAddBeforeLoadOrStore(const MachineInstr *Add, - const MachineInstr *Ldst) { - Register R = Add->getOperand(0).getReg(); - return dominatesAllUsesOf(Ldst, R, MDT, MRI); +bool ARC::ABAW::checkCandidatesImm() { + // Would offset deltas fit into S9 immediate? + int64_t PrevOffset = Candidates[0].second.getImm(); + MachineInstr *Incr = Increments[0].first; + + bool allVectorLDST = false; + // TODO + // if (ST.shouldFavorPostIncForVDSP()) { + // allVectorLDST = true; + // for (auto &C: Candidates) { + // if (C.first != Incr && !ARC::isVectorInstr(C.first->getDesc().TSFlags)) + // { + // allVectorLDST = false; + // break; + // } + // } + // } + + for (auto &C : Candidates) { + int64_t Offset = C.second.getImm(); + // Vector load/store supports 16-bit/32-bit LIMMs. 
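+      // For illustration (hypothetical offsets, not from a real test case): with
+      // candidate offsets {0, 4, 260} the successive deltas are {4, 256}; isInt<9>
+      // accepts only [-256, 255], so the 256 delta rejects the whole chain unless
+      // every candidate is a vector load/store (allVectorLDST, currently never
+      // set - see the TODO above) that can take the wider LIMM encoding.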
+ if (!allVectorLDST && !isInt<9>(Offset - PrevOffset)) { + // TODO: put it in register + LLVM_DEBUG(dbgs() << "\tcannot fix offset of " << *C.first); + return false; + } + if (IsOptimizeForSpace && C.first != Incr && mayHaveShortForm(*C.first) && + Offset >= 0 && (Offset - PrevOffset) < 0) { + LLVM_DEBUG(dbgs() << "Negative offset could hurt code size\n"); + return false; + } + PrevOffset = Offset; + } + return true; } -MachineInstr *ARCOptAddrMode::tryToCombine(MachineInstr &Ldst) { - assert(Ldst.mayLoadOrStore() && "LD/ST instruction expected"); +// Reorder add and sub to improve LICM and Post-inc load/store +// %vreg34 = PHI %vreg0, , %vreg41, -- Should have only two uses +// %vreg66 = LDSB_rr %vreg34, %vreg1; -- vreg1 must be loop invariant +// %vreg41 = SUB_rru6 %vreg34, 1; -- Should have only one use +// --> +// %vregxx = ADD_rrr %vreg0, %vreg1; -- Outside the loop at end of +// %vreg34 = PHI %vregxx, , %vreg41, %vreg66 = LDSB_rs9 +// %vreg34, 0; %vreg41 = SUB_rru6 %vreg34, 1; +bool ARC::ABAW::transformPhiForBetterLICM(MachineInstr *Phi, MachineInstr *Ldst, + unsigned BaseIdx, unsigned OffIdx) { + if (!ExternalDominatingUses.empty()) + return false; // TODO: possible to fix PHI use by updating BaseIdx in loop + // exit block + + LLVM_DEBUG(dbgs() << "[ABAW] Ready to transform with\nphi = " << *Phi + << " and Ldst = " << *Ldst); + MachineOperand &Offset = Ldst->getOperand(OffIdx); + MachineBasicBlock *MBB = Phi->getOperand(2).getMBB(); + unsigned NewReg = createVirtReg(Ldst->getOperand(BaseIdx).getReg()); + unsigned InitReg = Phi->getOperand(1).getReg(); + MachineInstr *NewAdd = nullptr; + + if (Offset.isReg()) { + unsigned Opc = ARC::getConversionToRS9(Ldst->getOpcode()); + assert(Opc != 0 && "No RS9 form for instruction"); + Ldst->setDesc(ST.getInstrInfo()->get(Opc)); + // TODO unsigned ADD = ST.isArc64() ? ARC::ADDL_rrr : ARC::ADD_rrr; + unsigned ADD = ST.isArc64() ? ARC::ADD_rrr : ARC::ADD_rrr; + NewAdd = BuildMI(*MBB, MBB->getFirstInstrTerminator(), Ldst->getDebugLoc(), + ST.getInstrInfo()->get(ADD), NewReg) + .addReg(InitReg) + .addReg(Offset.getReg()); + Offset.ChangeToImmediate(0); - unsigned BasePos, OffsetPos; + } else { + int64_t OffsetImm = Offset.getImm(); + // Try simple constant folding + int64_t amount; + MachineInstr *InitRegDef = getSsaDef(InitReg); + if (ARC::isAddConstantOp(*InitRegDef, &amount)) { + InitReg = ARC::getVReg(InitRegDef->getOperand(1)); + OffsetImm += amount; + } - LLVM_DEBUG(dbgs() << "[ABAW] tryToCombine " << Ldst); - if (!AII->getBaseAndOffsetPosition(Ldst, BasePos, OffsetPos)) { - LLVM_DEBUG(dbgs() << "[ABAW] Not a recognized load/store\n"); - return nullptr; + NewAdd = ARC::buildAddByConstant(MRI, *MBB, MBB->getFirstInstrTerminator(), + Ldst->getDebugLoc(), NewReg, InitReg, + OffsetImm); + + OffsetImm = + Offset.getImm(); // We might have spoiled OffsetImm above, get it again + MachineInstr *Incr = getSsaDef(Phi->getOperand(3)); + for (MachineInstr &MI : + MRI.use_nodbg_instructions(Ldst->getOperand(BaseIdx).getReg())) { + if (&MI == Incr) + continue; + // NOTE: It is caller's responsibility to ensure that finally all + // instructions can handle new displacement - in the middle of + // transformation chain this condition temporarily can break + MachineOperand &O = MI.getOperand(MI.mayLoad() ? 
2 : 1); + O.setImm(O.getImm() - OffsetImm); + } } - MachineOperand &Base = Ldst.getOperand(BasePos); - MachineOperand &Offset = Ldst.getOperand(OffsetPos); + // Update PHI's incoming value + Phi->getOperand(1).setReg(NewReg); - assert(Base.isReg() && "Base operand must be register"); - if (!Offset.isImm()) { - LLVM_DEBUG(dbgs() << "[ABAW] Offset is not immediate\n"); - return nullptr; - } + LLVM_DEBUG(dbgs() << "[ABAW] Transformed with\nphi = " << *Phi + << " and NewAdd = " << *NewAdd << " and Ldst = " << *Ldst + << "\n"); + return true; +} - Register B = Base.getReg(); - if (Register::isStackSlot(B) || !Register::isVirtualRegister(B)) { - LLVM_DEBUG(dbgs() << "[ABAW] Base is not VReg\n"); - return nullptr; +static unsigned isAddRRR(unsigned Opcode) { + switch (Opcode) { + // TODO + // case ARC::ADDL_rrr: + case ARC::ADD_rrr: + return 1; + // case ARC::ADD1L_rrr: + case ARC::ADD1_rrr: + return 2; + // case ARC::ADD2L_rrr: + case ARC::ADD2_rrr: + return 4; + // case ARC::ADD3L_rrr: + case ARC::ADD3_rrr: + return 8; } + return 0; +} +static unsigned isAddRRR(MachineInstr &MI) { return isAddRRR(MI.getOpcode()); } - // TODO: try to generate address preincrement - if (Offset.getImm() != 0) { - LLVM_DEBUG(dbgs() << "[ABAW] Non-zero offset\n"); - return nullptr; +unsigned ARC::ABAW::getConversionToAB(MachineInstr &MI) const { + if (ARC::isVectorInstr(MI.getDesc().TSFlags)) { + // TODO + // if (!ST.shouldFavorPostIncForVDSP()) + return 0; } + return ARC::getConversionToAB(MI.getOpcode()); +} - for (auto &Add : MRI->use_nodbg_instructions(B)) { - int64_t Incr; - if (!isAddConstantOp(Add, Incr)) - continue; - if (!isValidLoadStoreOffset(Incr)) - continue; - - SmallVector Uses; - MachineInstr *MoveTo = canJoinInstructions(&Ldst, &Add, &Uses); +unsigned ARC::ABAW::getVDSPAVVariant(unsigned ABOpcode) const { + return 0; // TODO upstream VDSP +} - if (!MoveTo) - continue; +unsigned ARC::ABAW::getConversionToAW(MachineInstr &MI) const { + if (ARC::isVectorInstr(MI.getDesc().TSFlags)) { + // TODO + // if (!ST.shouldFavorPostIncForVDSP()) + return 0; + } + return ARC::getConversionToAW(MI.getOpcode()); +} - if (!canFixPastUses(Uses, Add.getOperand(2), B)) - continue; +void ARC::ABAW::preBlockCallout(MachineBasicBlock &MBB) { + computeOrdinals(MBB); +} - LLVM_DEBUG(MachineInstr *First = &Ldst; MachineInstr *Last = &Add; - if (MDT->dominates(Last, First)) std::swap(First, Last); - dbgs() << "[ABAW] Instructions " << *First << " and " << *Last - << " combined\n"; +void ARC::ABAW::postBlockCallout(MachineBasicBlock &MBB, bool result) { + CurrentInst = nullptr; + clear(); +} - ); +bool ARC::ABAW::visit(MachineInstr &MI) { - MachineInstr *Result = Ldst.getNextNode(); - if (MoveTo == &Add) { - Ldst.removeFromParent(); - Add.getParent()->insertAfter(Add.getIterator(), &Ldst); - } - if (Result == &Add) - Result = Result->getNextNode(); + MachineBasicBlock *MBB = MI.getParent(); + if (!MBB) + return false; + if (MI.isDebugValue()) + return false; - fixPastUses(Uses, B, Incr); + // Identify inner, single block loops as SWP candidates and + // skip under vdsp-inner-post-inc=false. 
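+  // A loop consisting of one basic block with no subloops is an innermost,
+  // single-block loop; "SWP" here presumably refers to software pipelining.
+  // InSWPCandidate is only consulted by the VDSP gating just below, which is
+  // still hard-coded to false pending upstreaming.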
+ if (const MachineLoop *ML = MLI.getLoopFor(MBB)) + if (ML->getNumBlocks() == 1 && ML->getSubLoops().size() == 0) + InSWPCandidate = true; - int NewOpcode = ARC::getPostIncOpcode(Ldst.getOpcode()); - assert(NewOpcode > 0 && "No postincrement form found"); - unsigned NewBaseReg = Add.getOperand(0).getReg(); - changeToAddrMode(Ldst, NewOpcode, NewBaseReg, Add.getOperand(2)); - Add.eraseFromParent(); + if (ARC::isVectorInstr(MI.getDesc().TSFlags) && + false /*TODO (!ST.shouldFavorPostIncForVDSP() || (InSWPCandidate && VDSPInnerLoopMode == InnerLoopMode::DISABLED))*/ ) + return false; - return Result; + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE]>>>Visiting " << MI << "\n"); + CurrentInst = &MI; + if (getConversionToAB(MI) != 0) + return tryToCombine(MI); + + bool FoundSomething = false; + unsigned Reg = 0; + if (isAddConstantOp(MI, nullptr) || isAddRRR(MI.getOpcode())) { + Reg = ARC::getVReg(MI.getOperand(1)); + if (Reg && !Register::isStackSlot(Reg)) + FoundSomething = collectCandidates(Reg, MBB); + // TODO + if (!FoundSomething && + (MI.getOpcode() == + ARC::ADD_rrr /*|| MI.getOpcode() == ARC::ADDL_rrr*/)) { + Reg = ARC::getVReg(MI.getOperand(2)); + if (Reg && !Register::isStackSlot(Reg)) + FoundSomething = collectCandidates(Reg, MBB); + } } - return nullptr; + if (FoundSomething && analyzeCandidates(Reg, MBB) && + validateIncrementForms()) { + bool transformed = transformCandidates(Reg, MBB); + assert(transformed && "Failed to transform load/store candidates"); + return true; + } + return false; } -MachineInstr * -ARCOptAddrMode::canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add, - SmallVectorImpl *Uses) { - assert(Ldst && Add && "NULL instruction passed"); - - MachineInstr *First = Add; - MachineInstr *Last = Ldst; - if (MDT->dominates(Ldst, Add)) - std::swap(First, Last); - else if (!MDT->dominates(Add, Ldst)) - return nullptr; +// Check if either one instruction can be moved (up or down) to another. +// Returns instruction which another can be moved to (i.e., one not moved) or +// nullptr if nothing can be moved ASSUMPTIONS: +// - ldst uses one of add's operands as a base +// - ldst and add are in the same BB +MachineInstr *ARC::ABAW::canJoinInstructions(MachineInstr *ldst, + MachineInstr *add) { + assert(ldst && add && "NULL instruction passed"); - LLVM_DEBUG(dbgs() << "canJoinInstructions: " << *First << *Last); + bool isLoad = ldst->mayLoad(); - unsigned BasePos, OffPos; + unsigned B = ldst->getOperand(isLoad ? 
1 : 0).getReg(); - if (!AII->getBaseAndOffsetPosition(*Ldst, BasePos, OffPos)) { - LLVM_DEBUG( - dbgs() - << "[canJoinInstructions] Cannot determine base/offset position\n"); + MachineInstr *First = add; + MachineInstr *Last = ldst; + if (DOM.dominates(ldst, add)) { + std::swap(First, Last); + } else if (!DOM.dominates(add, ldst)) { return nullptr; } - Register BaseReg = Ldst->getOperand(BasePos).getReg(); + LLVM_DEBUG(dbgs() << "canJoinInstructions " << *First << *Last); // prohibit this: // v1 = add v0, c - // st v1, [v0, 0] - // and this - // st v0, [v0, 0] - // v1 = add v0, c - if (Ldst->mayStore() && Ldst->getOperand(0).isReg()) { - Register StReg = Ldst->getOperand(0).getReg(); - if (Add->getOperand(0).getReg() == StReg || BaseReg == StReg) { - LLVM_DEBUG(dbgs() << "[canJoinInstructions] Store uses result of Add\n"); - return nullptr; - } + // st [v0, 0], v1 + if (ldst->mayStore() && ldst->getOperand(2).isReg() && + add->getOperand(0).getReg() == ldst->getOperand(2).getReg()) { + LLVM_DEBUG(dbgs() << "\tStore uses result of Add\n"); + return nullptr; } - SmallVector UsesAfterLdst; - SmallVector UsesAfterAdd; - for (MachineInstr &MI : MRI->use_nodbg_instructions(BaseReg)) { - if (&MI == Ldst || &MI == Add) + // In any case, def of B (old base) must dominate all its uses (which means + // B is not used in any PHI node) + // Also, see if there are any uses of B after either instruction (ldst and + // add) + SmallVector usesAfterLdst; + SmallVector usesAfterAdd; + for (MachineOperand &o : MRI.use_nodbg_operands(B)) { + MachineInstr *MI = o.getParent(); + if (MI == First || MI == Last) continue; - if (&MI != Add && MDT->dominates(Ldst, &MI)) - UsesAfterLdst.push_back(&MI); - else if (!MDT->dominates(&MI, Ldst)) + if (MI != add && DOM.dominates(ldst, MI)) + usesAfterLdst.push_back(MI); + else if (!DOM.dominates(MI, ldst)) return nullptr; - if (MDT->dominates(Add, &MI)) - UsesAfterAdd.push_back(&MI); + if (DOM.dominates(add, MI)) + usesAfterAdd.push_back(MI); } MachineInstr *Result = nullptr; - if (First == Add) { + if (First == add) { // n = add b, i // ... // x = ld [b, o] or x = ld [n, o] + // Case 1: can we move add down to ldst? + // Conditions: + // - No uses of 'n' before ldst + // - b is not used in any PHI (already checked above) + LLVM_DEBUG(dbgs() << "\tCan move add down?..."); if (noUseOfAddBeforeLoadOrStore(First, Last)) { Result = Last; - LLVM_DEBUG(dbgs() << "[canJoinInstructions] Can sink Add down to Ldst\n"); - } else if (canHoistLoadStoreTo(Ldst, Add)) { - Result = First; - LLVM_DEBUG(dbgs() << "[canJoinInstructions] Can hoist Ldst to Add\n"); + LLVM_DEBUG(dbgs() << "YES\n"); + } else { + Result = nullptr; + // If both instructions are within same BB check if we can sink add down + // to its first use and them hoist load/store there. + // E.g. + // %vreg134 = ADD_rru6_ %vreg39, -4 + // CMP_rr %vreg39, %vreg0, %STATUS + // %vreg46 = MOVcc_rr %vreg134, %vreg2, pred:1, + // %STATUS + // .... 
+ // %vreg137 = LD_rs9 %vreg39, 0 + // It's possible to sink ADD past CMP and then hoist LD to ADD + if (ldst->getParent() == add->getParent()) { + MachineBasicBlock::iterator f(add), l(ldst); + unsigned R = add->getOperand(0).getReg(); + for (; f != l; ++f) { + if (f->readsVirtualRegister(R)) + break; + } + assert(f != l && "Use of add not found"); + MachineInstr *PredMI = &*std::prev(f); + if (PredMI != add && canHoistLoadStoreTo(ldst, PredMI)) { + LLVM_DEBUG(dbgs() + << "\tFound intermediate instruction to sink add to: " + << *PredMI); + add->removeFromParent(); + MachineBasicBlock *MBB = PredMI->getParent(); + // Move add right before first use + MBB->insert(f, add); + Result = add; + // Adjust array of past uses + SmallVector tmp; + for (MachineInstr *I : usesAfterAdd) { + if (DOM.dominates(add, I)) + tmp.push_back(I); + } + usesAfterAdd.swap(tmp); + } + } + LLVM_DEBUG(if (Result == nullptr) dbgs() + << "NO (add result is used in between)\n";); + } + + if (Result == nullptr) { + // Case 2: can we move ldst up to add? (longer live range) + LLVM_DEBUG(dbgs() << "\tCan move load/store up?..."); + if (!canHoistLoadStoreTo(ldst, add)) { + Result = nullptr; + LLVM_DEBUG(dbgs() << "NO (memory operation in between)\n"); + } else { + Result = First; + LLVM_DEBUG(dbgs() << "YES\n"); + } } } else { // x = ld [b, o] // ... // n = add b, i - Result = First; - LLVM_DEBUG(dbgs() << "[canJoinInstructions] Can hoist Add to Ldst\n"); + + if (SINK_LDST()) { + // Case 3: can we move ldst down to add? + LLVM_DEBUG(dbgs() << "\tCan move ldst down?..."); + if (!canSinkLoadStoreTo(ldst, add)) { + Result = nullptr; + } else { + Result = Last; + } + LLVM_DEBUG(dbgs() << (Result ? "YES\n" : "NO\n")); + } + + if (Result == nullptr) { + LLVM_DEBUG(dbgs() << "\tMoving add up\n"); + Result = ldst; + } + + // Final check: + // if add's second operand (i) is register, its def must _strictly_ dominate + // ld (valid for loads only) + unsigned A = ARC::getVReg(add->getOperand(2)); + if (A) { + if (!isLoad) { + LLVM_DEBUG(dbgs() << " NO (store cannot handle non-literal offset\n"); + return nullptr; + } + if (A == B) + A = ARC::getVReg(add->getOperand(1)); + MachineInstr *Def = getSsaDef(A); + if (Def == ldst || Def == nullptr || !DOM.dominates(Def, ldst)) { + LLVM_DEBUG(dbgs() << " NO (offset def does not dominate load\n"); + return nullptr; + } + } } - if (Result && Uses) - *Uses = (Result == Ldst) ? UsesAfterLdst : UsesAfterAdd; + + // Now check that we can update all uses of b after Result (if any) to use new + // value + InstrVector &instrs = (Result == ldst) ? 
usesAfterLdst : usesAfterAdd; + if (!instrs.empty()) { + if (!canFixPastUses(instrs, add, B)) + Result = nullptr; + } + + LLVM_DEBUG(if (Result != nullptr) dbgs() + << "canJoinInstructions: MoveTo: " << *Result; + else dbgs() << "Can not join instructions\n"); + return Result; } -bool ARCOptAddrMode::canFixPastUses(const ArrayRef &Uses, - MachineOperand &Incr, unsigned BaseReg) { +// Check that result of 'add' is not used before 'ldst' +bool ARC::ABAW::noUseOfAddBeforeLoadOrStore(MachineInstr *add, + MachineInstr *ldst) { + unsigned R = add->getOperand(0).getReg(); + return ARC::dominatesAllUsesOf(ldst, R, &DOM, &MRI); +} - assert(Incr.isImm() && "Expected immediate increment"); - int64_t NewOffset = Incr.getImm(); - for (MachineInstr *MI : Uses) { - int64_t Dummy; - if (isAddConstantOp(*MI, Dummy)) { - if (isValidIncrementOffset(Dummy + NewOffset)) - continue; - return false; - } - if (isLoadStoreThatCanHandleDisplacement(AII, *MI, -NewOffset)) - continue; - LLVM_DEBUG(dbgs() << "Instruction cannot handle displacement " << -NewOffset - << ": " << *MI); +void ARC::ABAW::setLoadStoreBaseOffset(MachineInstr &MI, unsigned NewBaseReg, + unsigned BaseReg, + const MachineOperand &NewOffset) { + MachineOperand Src = MachineOperand::CreateImm(0xDEADBEEF); + unsigned BaseIndex = 0; + bool IsStore = MI.mayStore(); + if (IsStore) { + // Store: NewBaseReg = ST.ab [BaseReg, NewOffset], Src + Src = MI.getOperand(2); + MI.RemoveOperand(2); + assert(NewOffset.isImm() && "Store can only handle immediate offsets"); + } else if (MI.getOperand(0).isReg() && MI.getOperand(0).isDef()) { + // Load: X, NewBaseReg = LD.ab [BaseReg, NewOffset] + BaseIndex = 1; + } else { + // Prefetch: NewBaseReg = PF.ab [BaseReg, NewOffset] + BaseIndex = 0; + } + MI.RemoveOperand(BaseIndex + 1); + MI.RemoveOperand(BaseIndex); + MI.addOperand(MachineOperand::CreateReg(NewBaseReg, true)); + MI.addOperand(MachineOperand::CreateReg(BaseReg, false)); + MI.addOperand(NewOffset); + if (IsStore) + MI.addOperand(Src); + if (!MI.getOperand(BaseIndex).isTied()) + MI.tieOperands(BaseIndex, BaseIndex + 1); +} + +void ARC::ABAW::setLoadStoreBaseOffset(MachineInstr &MI, unsigned NewBaseReg, + unsigned BaseReg, int64_t NewOffset) { + const MachineOperand Opnd = MachineOperand::CreateImm(NewOffset); + return setLoadStoreBaseOffset(MI, NewBaseReg, BaseReg, Opnd); +} + +static bool updateRegIncrement(MachineInstr &Def, ARC::RegIncrement &RI) { + unsigned NewReg = 0; + switch (Def.getOpcode()) { + // TODO upstream + // case ARC::MPYL_rru6: + case ARC::MPY_rru6: + NewReg = Def.getOperand(1).getReg(); + RI.Scale *= Def.getOperand(2).getImm(); + break; + // case ARC::ASLL_rru6: + case ARC::ASL_rru6: + NewReg = Def.getOperand(1).getReg(); + RI.Scale *= 1 << Def.getOperand(2).getImm(); + break; + // case ARC::ASL1_rr: + // NewReg = Def.getOperand(1).getReg(); + // RI.Scale *= 2; + // break; + default: return false; } + assert(Def.getOperand(0).getReg() == RI.Reg && "Wrong DEF Register"); + RI.Reg = NewReg; return true; } -void ARCOptAddrMode::fixPastUses(ArrayRef Uses, - unsigned NewBase, int64_t NewOffset) { +// Is "MI" the result of adding a constant or register to itself? 
+// If so, make the first source operand reference the PHI +bool ARC::ABAW::isSelfIncrementing(MachineInstr &MI) const { + switch (MI.getOpcode()) { + // TODO upstream + // case ARC::ADDL_rrr: + case ARC::ADD_rrr: { + const MachineInstr *B = getSsaDef(MI.getOperand(2)); + if (B && B->isPHI() && B->readsRegister(MI.getOperand(0).getReg())) { + // Swap operands so that the first source operand is the PHI + const MachineOperand Opd = MI.getOperand(1); + MI.RemoveOperand(1); + MI.addOperand(Opd); + return true; + } + } + LLVM_FALLTHROUGH; + case ARC::ADD_rru6: + case ARC::ADD_rrlimm: + // case ARC::ADDL_rru6: + // case ARC::ADDL_rrlimm: + // case ARC::SUBL_rru6: + case ARC::SUB_rru6: { + const MachineInstr *A = getSsaDef(MI.getOperand(1)); + if (A && A->isPHI() && A->readsRegister(MI.getOperand(0).getReg())) + return true; + break; + } + default: + break; + } + return false; +} - for (MachineInstr *MI : Uses) { - int64_t Amount; - unsigned BasePos, OffPos; - if (isAddConstantOp(*MI, Amount)) { - NewOffset += Amount; - assert(isValidIncrementOffset(NewOffset) && - "New offset won't fit into ADD instr"); - BasePos = 1; - OffPos = 2; - } else if (AII->getBaseAndOffsetPosition(*MI, BasePos, OffPos)) { - MachineOperand &MO = MI->getOperand(OffPos); - assert(MO.isImm() && "expected immediate operand"); - NewOffset += MO.getImm(); - assert(isValidLoadStoreOffset(NewOffset) && - "New offset won't fit into LD/ST"); - } else - llvm_unreachable("unexpected instruction"); - - MI->getOperand(BasePos).setReg(NewBase); - MI->getOperand(OffPos).setImm(NewOffset); - } -} - -bool ARCOptAddrMode::canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) { +// This function returns 1 for ADDs and -1 for SUBs and 0 for anything else +static int64_t findScale(MachineInstr *MI) { + int64_t Value = 0; + if (ARC::isAddConstantOp(*MI, &Value)) + return (Value >= 0) ? 1 : -1; + return 0; +} + +// TODO upsteam scaled indexed addressing +static unsigned getLoadScale(unsigned opcode, const ARCSubtarget &ST) { + return 0; +} + +// Try to find increment instruction we can combine Ldst with +// TODO: +// - port to Candidates API +// - base and offset registers are sometimes swapped - offset comes first +bool ARC::ABAW::tryToCombine(MachineInstr &Ldst) { + bool isLoad = Ldst.mayLoad(); + unsigned baseIndex = isLoad ? 1 : 0; + MachineOperand &base = Ldst.getOperand(baseIndex); + MachineOperand &offset = Ldst.getOperand(baseIndex + 1); + int64_t scale = 1; + unsigned B = ARC::getVReg(base); + if (!B || Register::isStackSlot(B)) + return false; + + if (!offset.isImm()) + return false; + + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE] Looking at ldst instruction: " << Ldst); + + MachineInstr *BaseDef = getSsaDef(B); + if (BaseDef != nullptr && BaseDef->getParent() == Ldst.getParent()) { + unsigned newOp = 0; + switch (BaseDef->getOpcode()) { + // TODO: SUB_rru6, etc. + case ARC::ADD_rru6: + // case ARC::ADDL_rru6: + if (int32_t(offset.getImm()) == + -int32_t(BaseDef->getOperand(2).getImm())) { + // + // Handle: + // + // %2 = PHI ... %1 ... + // ... + // %1 = ADD_rru6 %2, 4 + // ... + // %4 = LD_rs9 %1, -4 + // + // Convert LD to: %4, %1 = LD_rs9_ab %2, 4 + // and delete ADD_rru6 + if (Ldst.getNumOperands() == 3 && + !ARC::isUsedBetween(B, BaseDef, &Ldst)) { + newOp = ARC::getConversionToAB(Ldst.getOpcode()); + if (newOp && isSelfIncrementing(*BaseDef)) + break; + newOp = 0; + } + } + LLVM_FALLTHROUGH; + // case ARC::ADDL_rrr: + case ARC::ADD_rrr: { + // Look for: + // + // %2 = PHI ... %1 ... + // ... + // %1 = ADD_rrr %2, %3 + // ... 
+ // %4 = LD_rs9 %1, 0 + // + // Convert LD to: %4, %1 = LD_rr_aw %2, %3 + // and delete ADD_rrr + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE] base is: " << *BaseDef); + if (!(Ldst.mayStore() && BaseDef->getOperand(2).isReg()) && + offset.getImm() == 0 && Ldst.getNumOperands() == 3 && + !ARC::isUsedBetween(B, BaseDef, &Ldst)) { + newOp = BaseDef->getOperand(2).isImm() + ? ARC::getConversionToAW(Ldst.getOpcode()) + : ARC::getConversionToRRAW(Ldst.getOpcode()); + } + if (newOp && isSelfIncrementing(*BaseDef)) + break; + LLVM_DEBUG(if (newOp == 0) dbgs() << "[precombine] No _rr_aw form\n"; + else dbgs() << "[precombine] not self-referential\n"); + newOp = 0; + break; + } + default: + break; + } + if (newOp != 0) { + LLVM_DEBUG(dbgs() << "[precombine] Transforming " << Ldst); + Ldst.setDesc(ST.getInstrInfo()->get(newOp)); + if (!Ldst.getOperand(0).isDef()) { + // a store + // base2 = ST_rs9_aw base1, offset, src + MachineOperand src = Ldst.getOperand(2); + Ldst.RemoveOperand(2); + Ldst.RemoveOperand(1); + Ldst.RemoveOperand(0); + Ldst.addOperand(BaseDef->getOperand(0)); + Ldst.addOperand(BaseDef->getOperand(1)); + Ldst.addOperand(BaseDef->getOperand(2)); + Ldst.addOperand(src); + } else { + // a load + // dest, base2 = ld_rs9_aw base1, offset + Ldst.RemoveOperand(2); + Ldst.RemoveOperand(1); + Ldst.addOperand(BaseDef->getOperand(0)); + Ldst.addOperand(BaseDef->getOperand(1)); + Ldst.addOperand(BaseDef->getOperand(2)); + } + LLVM_DEBUG(dbgs() << "[precombine] to: " << Ldst + << "[precombine] Deleting " << *BaseDef); + BaseDef->eraseFromParent(); + return true; + } + } + if (offset.getImm() != 0) + return false; + for (auto UI = MRI.use_nodbg_begin(B), UE = MRI.use_nodbg_end(); UI != UE; + ++UI) { + bool isImmedOffset = true; + int64_t newOffset = 0; + MachineInstr *add = UI->getParent(); + // XXX: It's important to call findScale, not isAddConstantOp here to skip + // previously generated PSEUDO_ADD + // FIXME: need a better solution + scale = findScale(add); + if (!scale) { + isImmedOffset = false; + // TODO + if (add->getOpcode() != + ARC::ADD_rrr /*&& add->getOpcode() != ARC::ADDL_rrr*/) + continue; + if (Ldst.mayStore()) + continue; + } else { + newOffset = scale * add->getOperand(2).getImm(); + if (!isInt<9>(newOffset)) + continue; + } + + MachineInstr *MoveTo = canJoinInstructions(&Ldst, add); + if (MoveTo == nullptr) + continue; + + unsigned NewOpcode = isImmedOffset + ? getConversionToAB(Ldst) + : ARC::getConversionToRRAB(Ldst.getOpcode()); + if (NewOpcode == 0) { + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE] No postincrement form found: " + << Ldst << "\n"); + return false; + } + + MachineInstr *First = &Ldst; + MachineInstr *Last = add; + if (DOM.dominates(Last, First)) { + std::swap(First, Last); + } + + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE] Instructions " << *First << " and " + << *Last << " combined\n"); + + // TODO upstream GEN_PSEUDO_INC() + + if (MoveTo == add) { + Ldst.removeFromParent(); + add->getParent()->insertAfter(add->getIterator(), &Ldst); + } + toBeDeleted(add); + + // Create new vreg instead of reusing add's destination to maintain SSA form + // (as we do not immediately delete add) + // Needs to be same register class as result of add. 
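+      // Rough shape of the register-offset rewrite (the immediate-offset case is
+      // analogous), with %base2 standing for the fresh vreg created below:
+      //   %v = LD_rs9 %base, 0        -->   %v, %base2 = LD_rr_ab %base, %incr
+      //   %n = ADD_rrr %base, %incr         (uses of %n become %base2; the ADD,
+      //                                      now dead, was queued for deletion above)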
+ unsigned NewBaseReg = createVirtReg(add->getOperand(0).getReg()); + Ldst.setDesc(ST.getInstrInfo()->get(NewOpcode)); + + if (isImmedOffset) { + fixIntermediates(&Ldst, add->getOperand(0).getReg(), newOffset, B); + setLoadStoreBaseOffset(Ldst, NewBaseReg, B, newOffset); + } else { + assert(!Ldst.mayStore() && + "Unexpected Store when combining with ADD_rrr"); + unsigned BaseIdx = 1, OffIdx = 2; + if (ARC::getVReg(add->getOperand(OffIdx)) == B) + std::swap(BaseIdx, OffIdx); + + assert(ARC::getVReg(add->getOperand(BaseIdx)) == B && + "Base register mismatch"); + MachineOperand &Incr = add->getOperand(OffIdx); + fixIntermediatesReg(&Ldst, add->getOperand(0).getReg(), Incr.getReg(), 1, + B); + setLoadStoreBaseOffset(Ldst, NewBaseReg, B, Incr); + tiedRegisterMap[B] = {NewBaseReg, Incr.getReg(), &Ldst}; + } + + ARC::replaceAllUsesWith(MRI, add->getOperand(0).getReg(), NewBaseReg); + + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE] to form " << Ldst << "\n"); + return true; + } + return false; +} + +// Check if load/store instruction 'Ldst' can be hoisted up to instruction 'To' +// Conditions: +// - both instructions are in the same BB +// - there are no instructions with unknown side effects in between +// - there are no stores (if Ldst is a load) or loads and stores (if Ldst is a +// store) +// in between +// - If Ldst is a store, DEF of its value operand must dominate instruction To +// - If Ldst is a load with register offset, DEF of offset must dominate To +bool ARC::ABAW::canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) { if (Ldst->getParent() != To->getParent()) return false; MachineBasicBlock::const_iterator MI(To), ME(Ldst), @@ -417,111 +1353,1029 @@ for (auto &O : Ldst->explicit_operands()) { if (!O.isReg() || !O.isUse()) continue; - MachineInstr *OpDef = MRI->getVRegDef(O.getReg()); - if (!OpDef || !MDT->dominates(OpDef, To)) + MachineInstr *OpDef = getSsaDef(O); + if (!OpDef || !DOM.dominates(OpDef, To)) return false; } return true; } -// bool ARCOptAddrMode::canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) { -// // Can only sink load/store within same BB -// if (Ldst->getParent() != To->getParent()) -// return false; -// MachineBasicBlock::const_iterator MI(Ldst), ME(To), -// End(Ldst->getParent()->end()); - -// bool IsStore = Ldst->mayStore(); -// bool IsLoad = Ldst->mayLoad(); - -// Register ValReg = IsLoad ? 
Ldst->getOperand(0).getReg() : Register(); -// for (; MI != ME && MI != End; ++MI) { -// if (MI->isDebugValue()) -// continue; -// if (MI->mayStore() || MI->isCall() || MI->isInlineAsm() || -// MI->hasUnmodeledSideEffects()) -// return false; -// if (IsStore && MI->mayLoad()) -// return false; -// if (ValReg && MI->readsVirtualRegister(ValReg)) -// return false; -// } -// return true; -// } - -void ARCOptAddrMode::changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode, - unsigned NewBase, - MachineOperand &NewOffset) { - bool IsStore = Ldst.mayStore(); - unsigned BasePos, OffPos; - MachineOperand Src = MachineOperand::CreateImm(0xDEADBEEF); - AII->getBaseAndOffsetPosition(Ldst, BasePos, OffPos); +// Check if load/store instruction 'Ldst' can be sunk down to instruction 'To' +// Conditions: +// - both instructions are in the same BB +// - there are no instructions with unknown side effects in between +// - there are no stores (if Ldst is a load) or loads and stores (if Ldst is a +// store) +// in between +// - If Ldst is a load, its result must not be used before 'To' +bool ARC::ABAW::canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) { + if (Ldst->getParent() != To->getParent()) + return false; + MachineBasicBlock::const_iterator MI(Ldst), ME(To), + End(Ldst->getParent()->end()); - Register BaseReg = Ldst.getOperand(BasePos).getReg(); + // PREFETCH is neither one + bool IsStore = Ldst->mayStore(); + bool IsLoad = Ldst->mayLoad(); - Ldst.RemoveOperand(OffPos); - Ldst.RemoveOperand(BasePos); + if (IsLoad) { + unsigned ValReg = Ldst->getOperand(0).getReg(); + if (ARC::isUsedBetween(ValReg, MI, ME, /*isExclusiveMI*/ false)) + return false; + } - if (IsStore) { - Src = Ldst.getOperand(BasePos - 1); - Ldst.RemoveOperand(BasePos - 1); + for (; MI != ME && MI != End; ++MI) { + if (MI->isDebugValue()) + continue; + if (MI->mayStore() || MI->isCall() || MI->isInlineAsm() || + MI->hasUnmodeledSideEffects()) + return false; + if (IsStore && MI->mayLoad()) + return false; } - Ldst.setDesc(AST->getInstrInfo()->get(NewOpcode)); - Ldst.addOperand(MachineOperand::CreateReg(NewBase, true)); - if (IsStore) - Ldst.addOperand(Src); - Ldst.addOperand(MachineOperand::CreateReg(BaseReg, false)); - Ldst.addOperand(NewOffset); - LLVM_DEBUG(dbgs() << "[ABAW] New Ldst: " << Ldst); + return true; } -bool ARCOptAddrMode::processBasicBlock(MachineBasicBlock &MBB) { - bool Changed = false; - for (auto MI = MBB.begin(), ME = MBB.end(); MI != ME; ++MI) { - if (MI->isDebugValue()) +// Check if we can adjust all instructions (after ldst) using BaseReg to handle +// new base value (as incremented by Incr) +bool ARC::ABAW::canFixPastUses(const SmallVectorImpl &Uses, + MachineInstr *Incr, unsigned BaseReg) { + + MachineOperand &O = Incr->getOperand(2); + if (O.isImm()) { + int64_t NewOffset = O.getImm(); + for (MachineInstr *MI : Uses) { + if (isAddConstantOp(*MI, nullptr)) + continue; + if (!ARC::isLoadStoreThatCanHandleDisplacement(*MI, -NewOffset)) { + LLVM_DEBUG(dbgs() << "\tInstruction cannot handle displacement " + << -NewOffset << ": " << *MI); + return false; + ; + } + + if (IsOptimizeForSpace) { + // unsigned Opcode = MI->getOpcode(); + if (mayHaveShortForm(*MI) /*TODO && Opcode != ARC::PREFETCH_rs9*/) { + // In ideal case, all instructions can be transformed to short + // (2 byte forms). But load/stores with postincrement or negative + // offsets have no short forms, so they're 4 bytes. 
+ // Merging increment into load/store does not give a win for code size + // (2+2 == 4+0), so any change to negative offset will grow size. + unsigned offsetIdx = MI->mayLoad() ? 2 : 1; + int64_t Offset = MI->getOperand(offsetIdx).getImm(); + if (Offset >= 0 && (Offset - NewOffset) < 0) { + LLVM_DEBUG(dbgs() << "\tNegative offset may hurt code size\n"); + return false; + } + } + } + } + return true; + } + + // Offset is register + // Operands of ADD must be Incr's base register and addend register: + // %vreg10 = ADD_rrr %vreg9, %vreg2 + // LD/ST [%vreg9, %vreg2] + // %vreg11 = ADD2_rrr %vreg9, %vreg2 <- We're seeking to update this + unsigned AddendReg = O.getReg(); + if (AddendReg == BaseReg) + AddendReg = Incr->getOperand(1).getReg(); + for (MachineInstr *MI : Uses) { + switch (MI->getOpcode()) { + // TODO upstream + // case ARC::ADD1L_rrr: + // case ARC::ADD2L_rrr: + // case ARC::ADD3L_rrr: + case ARC::ADD1_rrr: + case ARC::ADD2_rrr: + case ARC::ADD3_rrr: + if (ST.isDSPv1() || !ST.hasMpy()) { + LLVM_DEBUG(dbgs() << "\tTarget has no MPY insn\n"); + return false; + } + LLVM_FALLTHROUGH; + // case ARC::ADDL_rrr: + case ARC::ADD_rrr: { + unsigned R = ARC::getVReg(MI->getOperand(2)); + if (!R) + break; + if (ARC::getVReg(MI->getOperand(1)) == BaseReg && R == AddendReg) + continue; + // TODO + if (MI->getOpcode() == + ARC::ADD_rrr /*|| MI->getOpcode() == ARC::ADDL_rrr*/) { + // Previous transformation could have changed reg1 = ADD2 reg2, reg3 + // to reg4 = MPY reg3, 3; reg1 = ADD reg2, reg4 + MachineInstr *AddendDef = getSsaDef(R); + if (!AddendDef || (AddendDef->getOpcode() != ARC::MPY_rru6 /* TODO: && AddendDef->getOpcode() != ARC::MPYL_rru6*/)) + break; + if (ARC::getVReg(AddendDef->getOperand(1)) == AddendReg) + continue; + } + } break; + default: + if (ARC::getConversionToRS9(MI->getOpcode())) { + unsigned R = ARC::getVReg(MI->getOperand(MI->mayLoad() ? 2 : 1)); + if (R == AddendReg) + continue; + } + } + LLVM_DEBUG( + dbgs() + << "\tOffset is register and there are uses of base after ld/add\n"); + return false; + } + return true; +} + +// Adjust instructions past ldst using 'Base' to accomodate register addendum +// 'Addend' +void ARC::ABAW::fixIntermediatesReg(MachineInstr *ldst, unsigned newBase, + unsigned Addend, unsigned Scale, + unsigned Base) { + LLVM_DEBUG(dbgs() << "fixIntermediatesReg: change " << printReg(Base, TRI) + << " to " << printReg(newBase, TRI) << " with addend " + << printReg(Addend, TRI) << " * " << Scale << " after " + << *ldst); + for (auto UI = MRI.use_nodbg_begin(Base), UE = MRI.use_nodbg_end(); UI != UE; + ++UI) { + MachineInstr *MI = UI->getParent(); + if (MI->getOpcode() == ARC::TBD) continue; - if (!MI->mayLoad() && !MI->mayStore()) + if (ldst == MI || DOM.dominates(MI, ldst)) continue; - if (ARC::getPostIncOpcode(MI->getOpcode()) < 0) + LLVM_DEBUG(dbgs() << "fixIntermediatesReg: " << *MI); + unsigned Opcode = MI->getOpcode(); + unsigned RS9 = ARC::getConversionToRS9(Opcode); + // canFixPastUses and similar checks ensure that MI is either Load or Add + // constant + if (RS9 != 0) { + // ld_rr [Base, Addend] -> ld_rs9 [NewBase, 0] + MI->setDesc(ST.getInstrInfo()->get(RS9)); + MI->getOperand(MI->mayLoad() ? 1 : 0).setReg(newBase); + MI->getOperand(MI->mayLoad() ? 2 : 1).ChangeToImmediate(0); + LLVM_DEBUG(dbgs() << "Changed to " << *MI); continue; - MachineInstr *Res = tryToCombine(*MI); - if (Res) { - Changed = true; - // Res points to the next instruction. 
Rewind to process it - MI = std::prev(Res->getIterator()); + } + + // Perform 'poor man' copy propagation on tied registers + bool done = false; + unsigned AddScale = isAddRRR(Opcode); + assert(AddScale && "Unexpected opcode"); + + Register OffReg = MI->getOperand(2).getReg(); + if (OffReg != Addend) { + // Only possible if on previous iteration we changed e.g. + // ADD2 base, addend; to tmp = MPY addend, 3; ADD base, tmp; + assert(MRI.hasOneNonDBGUser(OffReg)); + MachineInstr *OffDef = getSsaDef(OffReg); // %tmp + assert( + OffDef && + (OffDef->getOpcode() == + ARC::MPY_rru6 /* TODO: || OffDef->getOpcode() == ARC::MPYL_rru6*/)); + assert(OffDef->getOperand(1).getReg() == Addend); + int64_t Factor = OffDef->getOperand(2).getImm() - Scale; // N - Scale + assert(Factor >= 0); + // Change + // %newBase = %Base + %Addend * Scale + // ... + // %tmp = MPY %Addend, N ; %tmp = %Addend * N + // %foo = ADD %Base, %tmp ; %foo = %Base + %Addend * N + if (Factor == 0) { // N == Scale + // to: + // %foo = copy %newBase + OffDef->eraseFromParent(); + MI->setDesc(ST.getInstrInfo()->get(ARC::COPY)); + MI->getOperand(1).setReg(newBase); + MI->RemoveOperand(2); + LLVM_DEBUG(dbgs() << "Changed to " << *MI); + } else if (Factor > 0) { // N > Scale + // to: + // %tmp = MPY %Addend, (N - Scale) ; %tmp = %Addend * (N - Scale) + // %foo = ADD %newBase, %tmp ; %foo = %newBase + %Addend * (N - + // Scale) + // ; = %Base + %Addend * N + OffDef->getOperand(2).setImm(Factor); + MI->getOperand(1).setReg(newBase); + LLVM_DEBUG(dbgs() << "Changed to " << *OffDef << *MI); + } + continue; + } + + do { + AddScale -= Scale; + done = (AddScale == 0); + + if (!done) { + done = true; + auto search = tiedRegisterMap.find(newBase); + if (search != tiedRegisterMap.end()) { + TiedRegIncrement i = search->second; + if (!DOM.dominates(i.def, MI)) { + break; // Unusable + } + if (i.incReg == Addend) { + LLVM_DEBUG(dbgs() << "\tFound map " << printReg(newBase, TRI) + << " --> " << printReg(i.newReg, TRI)); + newBase = i.newReg; + done = false; + } + } + } + } while (!done); + + if (AddScale == 0) { + // TODO: if it is safe to remove this instruction here, then we can: + // ARC::replaceAllUsesWith(MRI, MI->getOperand(0).getReg(), newBase); + + // ADD_rrr Base, Addend -> ADD_rs9 NewBase, 0 (NOP, in fact) + // TODO + // unsigned ADD = ST.isArc64() ? ARC::ADDL_rru6 : ARC::ADD_rru6; + unsigned ADD = ST.isArc64() ? ARC::ADD_rru6 : ARC::ADD_rru6; + MI->setDesc(ST.getInstrInfo()->get(ADD)); + MI->getOperand(1).setReg(newBase); + MI->getOperand(2).ChangeToImmediate(0); + LLVM_DEBUG(dbgs() << "Changed to " << *MI); + continue; + } + + unsigned BaseIdx = 1; + if (ARC::getVReg(MI->getOperand(BaseIdx)) == Addend) + BaseIdx = 2; + + if (AddScale == 2 || AddScale == 4 || AddScale == 8) { + static const unsigned op[2][3] = { + {ARC::ADD1_rrr, ARC::ADD2_rrr, ARC::ADD3_rrr} + // TODO: {ARC::ADD1L_rrr,ARC::ADD2L_rrr,ARC::ADD3L_rrr} + }; + unsigned newOpcode = op[ST.isArc64()][AddScale >> 2]; + + // Add's operands can be swapped: + // %vreg98 = LD_rs9 %vreg2, 0 + // %vreg10 = ADD_rrr %vreg9, %vreg2 + MI->setDesc(ST.getInstrInfo()->get(newOpcode)); + MI->getOperand(BaseIdx).setReg(newBase); + LLVM_DEBUG(dbgs() << "Changed to " << *MI); + } else { + // Max scale is 8 (ADD3). 
+ // Since we decremented it at least once, it must be odd or 6 here + assert((AddScale & 1) || AddScale == 6); + unsigned R = createVirtReg(Addend); + bool mpyOK; + if (ST.isArc64()) { + mpyOK = ST.hasMpy64(); + } else { + mpyOK = ST.hasMpy() && !ST.isDSPv1(); + } + MachineInstr *NewMI = nullptr; + // TODO + // unsigned ADD = ST.isArc64() ? ARC::ADDL_rrr : ARC::ADD_rrr; + unsigned ADD = ST.isArc64() ? ARC::ADD_rrr : ARC::ADD_rrr; + if (mpyOK) { + // unsigned MPY = ST.isArc64() ? ARC::MPYL_rru6 : ARC::MPY_rru6; + unsigned MPY = ST.isArc64() ? ARC::MPY_rru6 : ARC::MPY_rru6; + NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + ST.getInstrInfo()->get(MPY), R) + .addReg(Addend) + .addImm(AddScale); + } else if (AddScale == 3 || AddScale == 5) { + unsigned ADD_SCALED; + if (AddScale == 3) { + // ADD_SCALED = ST.isArc64() ? ARC::ADD1L_rrr : ARC::ADD1_rrr; + ADD_SCALED = ST.isArc64() ? ARC::ADD1_rrr : ARC::ADD1_rrr; + } else { + // ADD_SCALED = ST.isArc64() ? ARC::ADD2L_rrr : ARC::ADD2_rrr; + ADD_SCALED = ST.isArc64() ? ARC::ADD2_rrr : ARC::ADD2_rrr; + } + NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + ST.getInstrInfo()->get(ADD_SCALED), R) + .addReg(Addend) + .addImm(Addend); + } else if (AddScale == 6) { + // unsigned SHL = ST.isArc64() ? ARC::ASLL_rru6 : ARC::ASL_rru6; + unsigned SHL = ST.isArc64() ? ARC::ASL_rru6 : ARC::ASL_rru6; + unsigned T1 = createVirtReg(Addend); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + ST.getInstrInfo()->get(SHL), T1) + .addReg(Addend) + .addImm(2); + // unsigned ADD_SCALED = ST.isArc64() ? ARC::ADD1L_rrr : ARC::ADD1_rrr; + unsigned ADD_SCALED = ST.isArc64() ? ARC::ADD1_rrr : ARC::ADD1_rrr; + NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + ST.getInstrInfo()->get(ADD_SCALED), R) + .addReg(T1) + .addImm(Addend); + } else { + assert(AddScale == 7); + // unsigned SHL = ST.isArc64() ? ARC::ASLL_rru6 : ARC::ASL_rru6; + unsigned SHL = ST.isArc64() ? ARC::ASL_rru6 : ARC::ASL_rru6; + unsigned T = createVirtReg(Addend); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + ST.getInstrInfo()->get(SHL), T) + .addReg(Addend) + .addImm(3); + // unsigned SUB = ST.isArc64() ? ARC::SUBL_rru6 : ARC::SUB_rru6; + unsigned SUB = ST.isArc64() ? ARC::SUB_rru6 : ARC::SUB_rru6; + NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + ST.getInstrInfo()->get(SUB), R) + .addImm(T) + .addReg(Addend); + } + MI->setDesc(ST.getInstrInfo()->get(ADD)); + MI->getOperand(BaseIdx).setReg(newBase); + MI->getOperand((BaseIdx == 1) ? 
2 : 1).setReg(R); + LLVM_DEBUG(dbgs() << "Changed to " << *NewMI << *MI); } } - return Changed; } -bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction()) || KILL_PASS()) +// Adjust add/load/store instructions (having 'base' as base operand) after ldst +// by delta 'newOffset' +void ARC::ABAW::fixIntermediates(MachineInstr *ldst, unsigned newBase, + int64_t newOffset, unsigned oldBase) { + + MachineRegisterInfo::use_nodbg_iterator UINext; + for (auto UI = MRI.use_nodbg_begin(oldBase), UE = MRI.use_nodbg_end(); + UI != UE; UI = UINext) { + MachineInstr *MI = UI->getParent(); + UINext = ++UI; + if (MI->getOpcode() == ARC::TBD) + continue; + if (DOM.dominates(MI, ldst) || (ldst == MI)) { + continue; + } + int64_t amount; + if (isAddConstantOp(*MI, &amount)) { + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE] Add constant to be adjusted by " + << newOffset << ": " << *MI); + MachineBasicBlock::iterator InsertPoint(MI); + MachineInstr *NewAdd = ARC::buildAddByConstant( + MRI, *(MI->getParent()), InsertPoint, MI->getDebugLoc(), + MI->getOperand(0).getReg(), newBase, amount - newOffset); + toBeDeleted(MI); + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE] ... to produce: " << *NewAdd); + continue; + } + + unsigned baseIdx = 0, offsetIdx = 1; + // unsigned Opcode = MI->getOpcode(); + // getBaseAndOffsetPosition does not work for PREFETCH + // neither does it works for LDD/STD + + // PREFETCH does not have MayLoad attribute in .td + if (MI->mayLoad()) { + baseIdx = 1; + offsetIdx = 2; + } + + if (MI->getOperand(baseIdx).isReg() && + (MI->getOperand(baseIdx).getReg() == oldBase)) { + assert(MI->getOperand(offsetIdx).isImm()); + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE] Offset to be adjusted by " + << newOffset << ": " << *MI); + MI->getOperand(offsetIdx).setImm(MI->getOperand(offsetIdx).getImm() - + newOffset); + MI->getOperand(baseIdx).setReg(newBase); + LLVM_DEBUG(dbgs() << "[PREREGCOMBINE] ... to produce: " << *MI); + } else { + LLVM_DEBUG(if (!MI->getOperand(baseIdx).isReg()) dbgs() + << "Operand at " << baseIdx << " is not a register in " + << *MI; + else dbgs() << "ERROR: Ldst base = " << printReg(oldBase, TRI) + << " does not match operand at pos " << baseIdx + << " in " << *MI); + } + } +} + +void ARC::ABAW::clear() { + Candidates.clear(); + Increments.clear(); + ExternalDominatedUses.clear(); + ExternalDominatingUses.clear(); + GeneratePreInc = false; +} + +bool ARC::ABAW::collectCandidates(unsigned BaseReg, MachineBasicBlock *MBB) { + LLVM_DEBUG(dbgs() << "collectCandidates: BaseReg = " << printReg(BaseReg, TRI) + << "\n"); + + clear(); + + // Keep track of vector candidates in inner loops. only + // permit a single vector candidate under -vdsp-inner-post-inc=single. + bool seenVectorLDST = false; + const bool limitVectorLDST = + InSWPCandidate && + false /*TODO VDSPInnerLoopMode == InnerLoopMode::SINGLE*/; + const bool disableVectorLDST = + InSWPCandidate && + false /*TODO VDSPInnerLoopMode == InnerLoopMode::DISABLED*/; + + for (MachineInstr &MI : MRI.use_nodbg_instructions(BaseReg)) { + MachineBasicBlock *B = MI.getParent(); + if (B != MBB) { + + if (DOM.dominates(MBB, B)) + ExternalDominatedUses.push_back(&MI); + else if (DOM.dominates(B, MBB)) + ExternalDominatingUses.push_back(&MI); + else + return false; + + continue; + } + bool isVector = ARC::isVectorInstr(MI.getDesc().TSFlags); + int64_t amount; + if (getConversionToAB(MI) != 0 || + ARC::getConversionFromRRToRRAB(MI.getOpcode()) != 0) { + unsigned BaseIdx = MI.mayLoad() ? 
1 : 0; + unsigned OffIdx = BaseIdx + 1; + if (ARC::getVReg(MI.getOperand(BaseIdx)) != BaseReg) { + // LD %off, %base + if (ARC::getVReg(MI.getOperand(OffIdx)) == BaseReg) + std::swap(BaseIdx, OffIdx); + else + return false; + } + MachineOperand &Off = MI.getOperand(OffIdx); + if (MI.getDesc().mayStore()) { // ignore AGU XY operands + // FIXME: + // 1) verify other similar cases with unexpected duplicate registers + // (Offset?) 2) try to optimize instead of skipping + // + // STAR 9001396665 + // st_s %r0,[%r0,12] + // st_s %r0,[%r0,8] + // add_s %r0,%r0,16 + + unsigned SrcIdx = OffIdx + 1; + if (SrcIdx < MI.getNumExplicitOperands()) { + MachineOperand &Src = MI.getOperand(SrcIdx); + if (Src.isReg() && Src.getReg() == BaseReg) { + LLVM_DEBUG( + dbgs() + << "Can't optimize: STORE Source register matches BaseReg\n"); + return false; + } + } + } + if (Off.isImm()) { + if (!(isVector && + ((limitVectorLDST && seenVectorLDST) || disableVectorLDST))) { + BaseIncr BI(Off.getImm()); + Candidates.push_back(std::make_pair(&MI, BI)); + seenVectorLDST |= isVector; + } + } else if (unsigned Reg = ARC::getVReg(Off)) { + if (!(isVector && + ((limitVectorLDST && seenVectorLDST) || disableVectorLDST))) { + BaseIncr BI(Reg, 1); + Candidates.push_back(std::make_pair(&MI, BI)); + seenVectorLDST |= isVector; + } + } else { + return false; + } + } else if (unsigned Scale = getLoadScale(MI.getOpcode(), ST)) { + unsigned OffIdx = MI.mayLoad() ? 2 : 1; + BaseIncr BI(MI.getOperand(OffIdx).getReg(), Scale); + if (BI.getReg() == BaseReg) + return false; // Scaled loads are not commutative + if (!(isVector && + ((limitVectorLDST && seenVectorLDST) || disableVectorLDST))) { + Candidates.push_back(std::make_pair(&MI, BI)); + seenVectorLDST |= isVector; + } + } else if (isAddConstantOp(MI, &amount)) { + BaseIncr BI((int64_t)amount); + Increments.push_back(std::make_pair(&MI, BI)); + } else if (unsigned Scale = isAddRRR(MI)) { + unsigned OffReg = ARC::getVReg(MI.getOperand(2)); + if (OffReg == BaseReg) { + // TODO + if (MI.getOpcode() != + ARC::ADD_rrr /*&& MI.getOpcode() != ARC::ADDL_rrr*/) + return false; + OffReg = ARC::getVReg(MI.getOperand(1)); + } + if (OffReg == 0) { + // Bail out on physical registers + return false; + } + BaseIncr BI(OffReg, Scale); + Increments.push_back(std::make_pair(&MI, BI)); + } else { + return false; + } + } + + if (Candidates.empty()) { + LLVM_DEBUG(dbgs() << "\tno candidate loads found\n"); return false; + } - if (DUMP_BEFORE()) - MF.dump(); - if (VIEW_BEFORE()) - MF.viewCFG(); + // Make sure instructions are lexically ordered + std::sort(Candidates.begin(), Candidates.end(), + [this](InstIncrPair a, InstIncrPair b) { + return Ordinals[a.first] < Ordinals[b.first]; + }); + // Use lambda so clang-format can do a decent job with it + auto dumpIt = [&]() { + dbgs() << "Candidates:\n"; + for (auto &C : Candidates) { + dbgs() << Ordinals[C.first] << ": " << *C.first << "\t"; + C.second.print(dbgs(), TRI); + dbgs() << "\n"; + } + dbgs() << "Increments:\n"; + for (auto &I : Increments) { + dbgs() << Ordinals[I.first] << ": " << *I.first << "\t"; + I.second.print(dbgs(), TRI); + dbgs() << "\n"; + } + dbgs() << "ExternalDominatedUses:\n"; + for (auto &EU : ExternalDominatedUses) { + dbgs() << *EU; + } + dbgs() << "ExternalDominatingUses:\n"; + for (auto &EU : ExternalDominatingUses) { + dbgs() << *EU; + } + }; + LLVM_DEBUG(dumpIt()); + return true; +} - AST = &MF.getSubtarget(); - AII = AST->getInstrInfo(); - MRI = &MF.getRegInfo(); - MDT = &getAnalysis(); +bool 
ARC::ABAW::analyzeCandidates(unsigned BaseReg, MachineBasicBlock *MBB) { + LLVM_DEBUG(dbgs() << "analyzeCandidates: BaseReg = " << printReg(BaseReg, TRI) + << "\n"); - bool Changed = false; - for (auto &MBB : MF) - Changed |= processBasicBlock(MBB); + // TODO: eh? try to combine to single increment? + if (Increments.size() > 1) { + LLVM_DEBUG(dbgs() << "\tmultiple increments found; not yet implemented\n"); + return false; + } - if (DUMP_AFTER()) - MF.dump(); - if (VIEW_AFTER()) - MF.viewCFG(); - return Changed; + if (Increments.empty()) { + // TODO: Increment might be found in ExternalDominatedUses. + // Consider loop with if statement with load and increment in different + // blocks + LLVM_DEBUG( + dbgs() << "\tno increments found in block; not yet implemented\n"); + return false; + } + + bool isAllImm = Increments.back().second.IsImm; + bool isAllReg = !isAllImm; + bool seenStore = false; + unsigned R = Increments.back().first->getOperand(0).getReg(); + for (auto &C : Candidates) { + bool IsImm = C.second.IsImm; + // LD_rs9, %Base, 0 works for both cases + isAllImm &= IsImm; + isAllReg &= (!IsImm || C.second.getImm() == 0); + if (!IsImm && !Register::isVirtualRegister(C.second.getReg())) + return false; + if (C.first->mayStore()) { + unsigned ValReg = ARC::getVReg(C.first->getOperand(2)); + // prohibit this: + // v1 = add v0, c + // st [v0, 0], v1 + if (ValReg == R) + return false; + seenStore = true; + } + } + + if (!isAllImm && !isAllReg) { + if (Candidates.size() == 1 && (!seenStore || Increments[0].second.IsImm) && + tryToTransformPHI(BaseReg, MBB)) { + isAllImm = Increments[0].second.IsImm; + isAllReg = !isAllImm; + } else { + LLVM_DEBUG(dbgs() << "\tirregular offsets detected\n"); + return false; + } + } + if (!isAllImm && seenStore) { + LLVM_DEBUG(dbgs() << "\tstore instruction cannot handle register offset\n"); + return false; + } + + MachineInstr *Incr = Increments.back().first; + + // Can we sink increment past last use? + MachineInstr *Last = Candidates.back().first; + if (Ordinals[Incr] < Ordinals[Last]) { + if (noUseOfAddBeforeLoadOrStore(Incr, Last)) { + // EMPTY + } else if (canHoistLoadStoreTo(Last, Incr) && Candidates.size() == 1) { + // TODO: can we handle Candidates.size() > 1 case? + Last->removeFromParent(); + MBB->insertAfter(Incr, Last); + LLVM_DEBUG(dbgs() << "\tinstruction " << *Last << "\twas moved to " + << *Incr); + } else { + // TODO: Add everything after increment to ExternalDominatedUses and try + // to combine + LLVM_DEBUG(dbgs() << "\tcannot sink increment after last use\n"); + return false; + } + } + + // Append increments to the candidate list for simpler analysis + Candidates.insert(Candidates.end(), Increments.begin(), Increments.end()); + + return isAllImm ? 
analyzeCandidatesImm(BaseReg, MBB) + : analyzeCandidatesReg(BaseReg, MBB); +} + +bool ARC::ABAW::analyzeCandidatesImm(unsigned BaseReg, MachineBasicBlock *MBB) { + LLVM_DEBUG(dbgs() << "analyzeCandidatesImm: BaseReg = " + << printReg(BaseReg, TRI) << "\n"); + // Cannot handle multiple increments + if (Increments.size() != 1) { + LLVM_DEBUG(dbgs() << "\tmultiple increments case is not handled\n"); + return false; + } + + MachineInstr *Incr = Increments[0].first; + + if (!ExternalDominatedUses.empty() && + !canFixPastUses(ExternalDominatedUses, Incr, BaseReg)) { + return false; + } + + if (Candidates[0].second.getImm() != 0) { + // Try to generate preinc instruction first + if (tryToGenPreIncImm()) + return true; + } + + if (!checkCandidatesImm()) { + // Try to reorder memory instrustions to get foldable chain + if (!tryToReorderCandidatesImm() || !checkCandidatesImm()) { + return false; + } + } + + if (Candidates[0].second.getImm() != 0) { + if (!tryToTransformPHI(BaseReg, MBB)) { + if (!tryToReorderCandidatesImm() || !tryToTransformPHI(BaseReg, MBB)) + return false; + } + } + + // TODO: this assert was put in assumption is that checkCandidatesImm does not + // depend on PHI transformation. But in fact, it does. Consider: + // LD %0, -68 + // LD %0, -4 + // ADD %0, 8 + // Before PHI transform, checkCandidatesImm would return true (as offset + // deltas are all positive), but after transfor it will turn into + // LD %1, 0 + // LD %1, 64 + // ADD %1, 8 + // Second LD would need negative offset, so checkCandidatesImm will return + // false at -Os Need to fix checkCandidatesImm to handle non-zero first offset + // TODO: In the above, only first load needs to be changed to LD_ab + // testcase: audio_codecs/sbc_decoder/src/decoder/sbc_decoder.c @ -Os + // hs45d_voice_audio assert(checkCandidatesImm() && "Unxepected unfoldable + // chain detected"); + if (!checkCandidatesImm()) + return false; + + // Update offsets with new offset values for postincrements + size_t N = Candidates.size(); + for (size_t i = 0; i < N - 1; ++i) { + int64_t NewOffset = + Candidates[i + 1].second.getImm() - Candidates[i].second.getImm(); + Candidates[i].second.setImm(NewOffset); + } + + if (ARC::isOptimizeForSpace(MF)) { + // We don't have compact ld/st .ab/.aw instructions + unsigned CompactSave = 0; + for (auto &P : Candidates) { + if (P.second.IsImm && + isUInt<5>(P.second.getImm()) && // TODO check u5/u6/u7 ldub/lduh/ld + mayHaveShortForm(*P.first)) + CompactSave += 2; // Would save 2 bytes over incremented ld/st + } + // TODO/CHECKME check -Wcg,-arc-abaw-max-space=N setting + // Do we want different value for -Os1? 
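+    // Illustrative example: a chain such as
+    //   ld_s  r0,[r1,0] ; ld_s r2,[r1,4] ; add_s r1,r1,8   (three 2-byte insns)
+    // would become two 4-byte post-increment loads
+    //   ld.ab r0,[r1,4] ; ld.ab r2,[r1,4]
+    // Every candidate that could have stayed in a 2-byte short form is
+    // counted above as a 2-byte loss, and the rewrite is abandoned once the
+    // accumulated loss exceeds arc-abaw-max-space.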
+ if (CompactSave > ArcAbawMaxSpace) { + LLVM_DEBUG(dbgs() << "analyzeCandidatesImm: Compact form saves " + << CompactSave << " bytes\n"); + return false; + } + } + + // Remove increment from candidate list + assert(Candidates.back().first == Incr); + Candidates.pop_back(); + return true; +} + +bool ARC::ABAW::analyzeCandidatesReg(unsigned BaseReg, MachineBasicBlock *MBB) { + + LLVM_DEBUG(dbgs() << "analyzeCandidatesReg: BaseReg = " + << printReg(BaseReg, TRI) << "\n"); + if (!Candidates[0].second.IsImm) { + LLVM_DEBUG( + dbgs() << "\tcannot handle reg chain starting with non-zero offset\n"); + return false; + } + + // prune duplicate ldst instructions with zero offset + unsigned LastZeroOff = 0; + for (size_t i = 0; i < Candidates.size(); ++i) { + if (Candidates[i].second.IsImm && Candidates[i].second.getImm() == 0) + LastZeroOff = i; + } + if (LastZeroOff != 0) { + Candidates.erase(Candidates.begin(), Candidates.begin() + LastZeroOff); + } + + bool NeedToPropagateOffsetReg = false; + unsigned OffsetReg = Increments[0].second.getReg(); + MachineInstr *OffDef = getSsaDef(OffsetReg); + if (OffDef == nullptr) + return false; + if (Candidates.size() > 1) { + for (auto &C : Candidates) { + if (OffDef == C.first || !DOM.dominates(OffDef, C.first)) + return false; + if (C.second.IsImm) + continue; + unsigned Reg = C.second.getReg(); + NeedToPropagateOffsetReg |= (Reg != OffsetReg); + } + } + + std::map RegValueMap; + + if (NeedToPropagateOffsetReg) { + // Instructions use different addend registers + // Try to value propagate them to find common value + // E.g. + // %vreg11 = ASL1_rr %vreg5 + // ... + // %vreg90 = LD_rras %vreg44, %vreg5 - Addend %vreg5, scale 4 + // %vreg90 = LD %vreg44, %vreg11 - Addend %vreg11, scale 1; can + // propagate to %vreg5, scale 2 %vreg53 = ADD2_rrr %vreg44, %vreg11 - + // Addend %vreg11, scale 4; can propagate to %vreg5, scale 8 + + for (auto &C : Candidates) { + if (C.second.IsImm) + continue; + RegIncrement RI = {C.second.getReg(), 1}; + + bool done = false; + while (!done) { + unsigned Reg = RI.Reg; + if (MachineInstr *Def = getSsaDef(Reg)) { + done = !updateRegIncrement(*Def, RI); + } else { + done = true; + } + } + unsigned OrigReg = C.second.getReg(); + if (OrigReg != RI.Reg) { + RegValueMap[OrigReg] = RI; + LLVM_DEBUG(dbgs() << "\tRecording " << printReg(OrigReg, TRI) << " -> " + << printReg(RI.Reg, TRI) << " * " << RI.Scale + << "\n"); + RI.Scale *= C.second.getScale(); + C.second.getRegIncrement() = RI; + } + } + LLVM_DEBUG(dbgs() << "Candidates after value propagation:\n"; + for (auto &C + : Candidates) { + dbgs() << *C.first << "\t"; + C.second.print(dbgs(), TRI); + dbgs() << "\n"; + }); + } + + // All instruction in chain must have same offset register with uniform stride + size_t N = Candidates.size(); + unsigned CommonAddend = Candidates[1].second.getReg(); + int64_t Delta = Candidates[1].second.getScale(); + for (size_t i = 2; i < N; ++i) { + if (Candidates[i].second.getReg() != CommonAddend) { + LLVM_DEBUG(dbgs() << "\tcommon addend register not found[" << i + << "]: " << *Candidates[i].first); + return false; + } + int64_t D = + Candidates[i].second.getScale() - Candidates[i - 1].second.getScale(); + if (D != Delta) { + LLVM_DEBUG(dbgs() << "\tnon-uniform delta[" << i << "]: " << D << "\n"); + return false; + } + } + + if (Delta != 1) { + // Try to find existing vreg with necessary value + LLVM_DEBUG(dbgs() << "Look for existing vreg for " + << printReg(CommonAddend, TRI) << " * " << Delta << "\n"); + for (auto &I : RegValueMap) { + RegIncrement RI 
= I.second; + if (RI.Reg == CommonAddend && (int64_t)RI.Scale == Delta) { + LLVM_DEBUG(dbgs() << "\tFound suitable vreg " << printReg(I.first, TRI) + << ": " << printReg(RI.Reg, TRI) << " * " << RI.Scale + << "\n"); + CommonAddend = I.first; + Delta = 1; + break; + } + } + } + + // TODO: We can try to move instructions around + MachineInstr *Def = getSsaDef(CommonAddend); + if (!Def || !DOM.dominates(Def, Candidates[0].first)) { + LLVM_DEBUG( + dbgs() << "Def of addend register does not dominate all mem instrs\n"); + return false; + } + + if (!ExternalDominatedUses.empty() && + !canFixPastUses(ExternalDominatedUses, Increments[0].first, BaseReg)) { + LLVM_DEBUG(dbgs() << "\tcannot fix past uses \n"); + return false; + } + + if (Delta != 1) { + // Cannot find existing vreg holding CommonAddend * Delta + // Generate new instruction + unsigned R = createVirtReg(CommonAddend); + MachineInstr *CommonAddendDef = getSsaDef(CommonAddend); + LLVM_DEBUG(dbgs() << "\tCommonAddendDef: " << *CommonAddendDef); + if (CommonAddendDef->getParent() == MBB) { + LLVM_DEBUG( + dbgs() + << "Creating new instruction in the same MBB is unprofitable\n"); + return false; + } + MachineBasicBlock *InsBB = CommonAddendDef->getParent(); + MachineBasicBlock::iterator I(CommonAddendDef); + ++I; + while (I != InsBB->end() && I->getOpcode() == ARC::PHI) + ++I; + MachineInstr *NewAddend = nullptr; + if (isPowerOf2_64(Delta)) { + unsigned Shift = Log2_64(Delta); + // unsigned ShiftOP = ST.isArc64() ? ARC::ASLL_rru6 : ARC::ASL_rru6; + unsigned ShiftOP = ST.isArc64() ? ARC::ASL_rru6 : ARC::ASL_rru6; + NewAddend = BuildMI(*InsBB, I, CommonAddendDef->getDebugLoc(), + ST.getInstrInfo()->get(ShiftOP), R) + .addReg(CommonAddend) + .addImm(Shift); + } else { + if (!ST.hasMpy()) { + LLVM_DEBUG(dbgs() << "Target has no MPY option, cannot generate " + "required instruction\n"); + return false; + } + if (ST.isArc64() && !ST.hasMpy64()) { + LLVM_DEBUG(dbgs() << "Target has no MPY option, cannot generate " + "required instruction\n"); + return false; + } + if (ST.isDSPv1()) { + LLVM_DEBUG(dbgs() << "Cannot use MPY instruction in DSPv1 ISA\n"); + return false; + } + // unsigned MPY = ST.isArc64() ? ARC::MPYL_rru6 : ARC::MPY_rru6; + unsigned MPY = ST.isArc64() ? ARC::MPY_rru6 : ARC::MPY_rru6; + NewAddend = BuildMI(*InsBB, I, CommonAddendDef->getDebugLoc(), + ST.getInstrInfo()->get(MPY), R) + .addReg(CommonAddend) + .addImm(Delta); + } + LLVM_DEBUG(dbgs() << "\tCreated new instr for addend: " << *NewAddend); + CommonAddend = R; + } + + for (auto &C : Candidates) { + C.second.IsImm = false; + C.second.setReg(CommonAddend); + C.second.setScale(1); + } + + // Remove increment from candidate list + assert(Candidates.back().first == Increments.back().first); + Candidates.pop_back(); + return true; +} + +static unsigned getFormRRAB(unsigned op) { + unsigned NewOpcode = ARC::getConversionToRRAB(op); + if (NewOpcode == 0) + NewOpcode = ARC::getConversionFromScaledToRRAB(op); + if (NewOpcode == 0) + NewOpcode = ARC::getConversionFromRRToRRAB(op); + return NewOpcode; +} + +// Verify all "Candidates" have increment forms +bool ARC::ABAW::validateIncrementForms() const { + // Check first that we can transform all candidates. The ARCv3 FPU + // load/store instructions don't have all the normal addressing forms. + // Must match logic of loop in transformCandidates() + for (const auto &C : Candidates) { + MachineInstr *MI = C.first; + if (C.second.IsImm) { + unsigned NewOpcode = + GeneratePreInc ? 
getConversionToAW(*MI) : getConversionToAB(*MI); + if (NewOpcode == 0) + return false; + } else { + if (getFormRRAB(MI->getOpcode()) == 0) + return false; + } + } + return true; +} + +bool ARC::ABAW::transformCandidates(unsigned BaseReg, MachineBasicBlock *MBB) { + LLVM_DEBUG(dbgs() << "transformCandidates: BaseReg = " + << printReg(BaseReg, TRI) << "\n"); + unsigned PrevBaseReg = BaseReg; + unsigned NewBaseReg = 0; + + // Keep loop in sync with duplicate logic in validateIncrementForms() + for (auto &C : Candidates) { + MachineInstr *MI = C.first; + LLVM_DEBUG(dbgs() << "Change " << *MI); + NewBaseReg = MRI.createVirtualRegister(MRI.getRegClass(BaseReg)); + if (C.second.IsImm) { + unsigned NewOpcode = + GeneratePreInc ? getConversionToAW(*MI) : getConversionToAB(*MI); + assert(NewOpcode != 0 && "Cannot find opcode for register postincrement"); + int64_t NewOffset = C.second.getImm(); + if (NewOffset == 0) { + MI->getOperand(MI->mayLoad() ? 1 : 0).setReg(PrevBaseReg); + MI->getOperand(MI->mayLoad() ? 2 : 1).setImm(0); + continue; + } + // TODO + // if (unsigned AVOpcode = getVDSPAVVariant(NewOpcode)) { + // int ShiftAmt = Log2_32(ST.getVDSPVecWidth()/8); + // int64_t ShiftedVal = NewOffset >> ShiftAmt; + // if ((ShiftedVal << ShiftAmt) == NewOffset && (ShiftedVal <= 31 && + // ShiftedVal >= -32)) { + // NewOpcode = AVOpcode; + // NewOffset = ShiftedVal; + // } + // } + MI->setDesc(ST.getInstrInfo()->get(NewOpcode)); + setLoadStoreBaseOffset(*MI, NewBaseReg, PrevBaseReg, NewOffset); + } else { + unsigned NewOpcode = getFormRRAB(MI->getOpcode()); + assert(NewOpcode != 0 && "Cannot find opcode for register postincrement"); + MI->setDesc(ST.getInstrInfo()->get(NewOpcode)); + MachineOperand Opnd = MachineOperand::CreateReg(C.second.getReg(), false); + setLoadStoreBaseOffset(*MI, NewBaseReg, PrevBaseReg, Opnd); + tiedRegisterMap[BaseReg] = {NewBaseReg, Opnd.getReg(), MI}; + } + PrevBaseReg = NewBaseReg; + LLVM_DEBUG(dbgs() << "To " << *MI); + } + + // Replace Inc's result with last created new base register + assert(Increments.size() == 1); + MachineInstr *Incr = Increments[0].first; + toBeDeleted(Incr); + if (!ExternalDominatedUses.empty()) { + if (Increments[0].second.IsImm) { + fixIntermediates(Candidates.back().first, PrevBaseReg, + Increments[0].second.getImm(), BaseReg); + } else { + fixIntermediatesReg( + Candidates.back().first, // after this insns + PrevBaseReg, // use this as a new base + Increments[0].second.getReg(), // adjust by this reg + Increments[0].second.getScale(), // scaled by this value + BaseReg); // original base reg + } + } + ARC::replaceAllUsesWith(MRI, Incr->getOperand(0).getReg(), PrevBaseReg); + + return true; } //===----------------------------------------------------------------------===// // Public Constructor Functions //===----------------------------------------------------------------------===// -FunctionPass *llvm::createARCOptAddrMode() { return new ARCOptAddrMode(); } +FunctionPass *llvm::createARCOptAddrMode(const ARCSubtarget &ST) { + return new ARCOptAddrMode(ST); +} Index: llvm/lib/Target/ARC/ARCSubtarget.h =================================================================== --- llvm/lib/Target/ARC/ARCSubtarget.h +++ llvm/lib/Target/ARC/ARCSubtarget.h @@ -63,6 +63,18 @@ } bool hasNorm() const { return Xnorm; } + + // TODO: make configurable + bool isDSPv1() const { return false; } + + // TODO: make configurable + bool hasMpy() const { return true; } + + // TODO add ARC64 support + bool isArc64() const { return false; } + + // TODO: make configurable + 
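+  // Reports whether a 64-bit multiply is available; ARCOptAddrMode checks
+  // this before synthesizing an MPY when targeting ARC64.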
bool hasMpy64() const { return false; } }; } // end namespace llvm Index: llvm/lib/Target/ARC/ARCTargetMachine.h =================================================================== --- llvm/lib/Target/ARC/ARCTargetMachine.h +++ llvm/lib/Target/ARC/ARCTargetMachine.h @@ -31,6 +31,7 @@ CodeGenOpt::Level OL, bool JIT); ~ARCTargetMachine() override; + const ARCSubtarget &getSubtarget() const { return Subtarget; } const ARCSubtarget *getSubtargetImpl() const { return &Subtarget; } const ARCSubtarget *getSubtargetImpl(const Function &) const override { return &Subtarget; Index: llvm/lib/Target/ARC/ARCTargetMachine.cpp =================================================================== --- llvm/lib/Target/ARC/ARCTargetMachine.cpp +++ llvm/lib/Target/ARC/ARCTargetMachine.cpp @@ -47,9 +47,12 @@ /// ARC Code Generator Pass Configuration Options. class ARCPassConfig : public TargetPassConfig { +private: + const ARCSubtarget &ST; + public: ARCPassConfig(ARCTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM), ST(TM.getSubtarget()) {} ARCTargetMachine &getARCTargetMachine() const { return getTM(); @@ -74,8 +77,8 @@ void ARCPassConfig::addPreEmitPass() { addPass(createARCBranchFinalizePass()); } void ARCPassConfig::addPreRegAlloc() { - addPass(createARCExpandPseudosPass()); - addPass(createARCOptAddrMode()); + addPass(createARCExpandPseudosPass()); + addPass(createARCOptAddrMode(ST)); } // Force static initialization. Index: llvm/lib/Target/ARC/ARCUtil.h =================================================================== --- /dev/null +++ llvm/lib/Target/ARC/ARCUtil.h @@ -0,0 +1,382 @@ +//===- ARCUtil.h ----------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file defines utility functions and classes used throughout +/// the ARC code generator. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARC_ARCUTIL_H +#define LLVM_LIB_TARGET_ARC_ARCUTIL_H + +#include "ARC.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +namespace llvm { +#if 0 +} fix emacs; +#endif + +class MachineDominatorTree; + +namespace ARC { +#if 0 +} fix emacs; +#endif + +// Determine if the MachineFunction should be optimized for code size +// over performance +extern bool isOptimizeForSpace(const MachineFunction &); + +// Return the virtual Register number when the operand is a virtual +// register otherwise return zero +inline Register getVReg(const MachineOperand &o) { + if (!o.isReg()) + return 0; + auto R = o.getReg(); + if (!Register::isVirtualRegister(R)) + return 0; + return R; +} + +// Return the physical Register number when the operand is a physical +// register otherwise return zero +inline Register getPReg(const MachineOperand &o) { + if (!o.isReg()) + return 0; + auto R = o.getReg(); + if (!Register::isPhysicalRegister(R)) + return 0; + return R; +} + +// Determine if the given virtual Register is unused (a debug info use +// is excluded). False is returned if the Register provided is not a +// virtual Register. 
+static inline bool isUnusedVReg(const MachineRegisterInfo &MRI, Register R) { + if (!Register::isVirtualRegister(R)) + return false; + return MRI.use_nodbg_empty(R); +} + +// Determine if the given virtual Register is unused (a debug info use +// is excluded). False is returned if the Register provided is not a +// virtual Register. +static inline bool isUnusedVReg(const MachineRegisterInfo &MRI, + const MachineOperand &opd) { + if (!opd.isReg()) + return false; + return isUnusedVReg(MRI, opd.getReg()); +} + +// Find the unique definition of a virtual Register else return +// nullptr +static inline MachineInstr *getVRegDef(const MachineRegisterInfo &MRI, + Register RegNo) { + if (!Register::isVirtualRegister(RegNo)) + return nullptr; + if (!MRI.hasOneDef(RegNo)) + return nullptr; + return MRI.getVRegDef(RegNo); +} + +// Find the unique definition of a virtual Register else return +// nullptr +static inline MachineInstr *getVRegDef(const MachineRegisterInfo &RINFO, + const MachineOperand &o) { + if (!o.isReg()) + return nullptr; + if (o.isUndef()) + return nullptr; + return getVRegDef(RINFO, o.getReg()); +} + +// Create a new virtual register that has the same register class as "virtReg" +extern Register createVirtReg(MachineRegisterInfo *, Register virtReg); + +// Create a new virtual register that has the same register class as +// "virtRegOpd" +extern Register createVirtReg(MachineRegisterInfo *, const MachineOperand &); + +// Find the constant value of the operand +extern bool getImmed(MachineRegisterInfo &MRI, const MachineOperand &opd, + int64_t &imm); + +// is "MI" a load or store that can handle "offset" as a displacement? +bool isLoadStoreThatCanHandleDisplacement(const MachineInstr &MI, + int64_t offset); + +// Determine if MI is an add instruction with a constant operand +bool isAddConstantOp(const MachineInstr &MI, int64_t *amount = nullptr); + +// Insert an add-by-constant, using the most efficient 32-bit opcode. +MachineInstr *buildAddByConstant(const MachineRegisterInfo &, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPoint, + DebugLoc DL, Register DestReg, Register SrcReg, + int constant); + +// Returns true if machine instruction MI dominates all uses of virtual register +// VReg +bool dominatesAllUsesOf(MachineInstr *, Register VReg, MachineDominatorTree *, + MachineRegisterInfo *); + +// Is "reg" read between MI (exclusive) and ME (exclusive)? +// Should work on virtual or physical registers. +// If "stopIfModified" is true, then return false if the (physical) register +// is modified before being read. +bool isUsedBetween(Register reg, MachineBasicBlock::const_iterator MI, + MachineBasicBlock::const_iterator ME, + bool isExclusiveMI = true, bool stopIfTrashed = false); + +// Replace all uses of "FromReg" to "ToReg". 
Does not modify defs +void replaceAllUsesWith(MachineRegisterInfo &, Register FromReg, + Register ToReg); + +// A visitor pattern class used throughout the ARC code generator to +// write optimizations in a structured way +class InstructionVisitor { +public: + enum IterationDirection { Forward, Reverse }; + +protected: + MachineFunction &MF; + const ARCSubtarget &ST; + const TargetRegisterInfo *TRI; + MachineRegisterInfo &MRI; + bool changed = false; // Records if any changes have been made to MF + bool quit = false; // Records that the user wants to stop the iteration + + // A container to hold instructions which will be deleted upon + // completion of iterating over a MachineBasicBlock + SmallVector tbd; + + // Deletes each MachineInstr in container "tbd" + void deletePending(); + + // Register an instruction to be deleted by a later call to "deletePending()". + // If the instruction is not going to be referenced later, + // then pass "clearOperands" as true so that there are no longer any + // register references to the soon-to-be-deleted instruction. + void toBeDeleted(MachineInstr *MI, bool ClearOperands = false); + + // Iterates and calls the visitor methods for each MachineInstr in + // the MachineBasicBlock in forward direction [first..last] + bool iterateBlockForward(MachineBasicBlock &); + + // Iterates and calls the visitor methods for each MachineInstr in + // the MachineBasicBlock in reverse direction [last..first] + bool iterateBlockBackward(MachineBasicBlock &); + + // Iterates and calls the visitor methods for each MachineInstr in + // the MachineBasicBlock in the specified direction. It will repeat + // up to \p maxRepetitions times but terminates if no changes were + // made. + bool iterateBlock(MachineBasicBlock &, IterationDirection, + unsigned maxRepetitions); + + // Iterates over every MachineInstr in every MachineBasicBlock and + // calls the visitor methods in the specified direction. It will + // repeat up to \p maxRepetitions times but terminates if no changes + // were made. + bool iterate(IterationDirection, unsigned maxRepetitions); + + // Handles calling visit() and the pre/post visit callouts as well + // as maintaining the "changed" member variable + bool visitOneInstruction(MachineInstr &); + +public: + InstructionVisitor(const ARCSubtarget &, MachineFunction &); + virtual ~InstructionVisitor() {} + + // This is the main method expected to be overriden by the user. It + // will be called for each MachineInstr in each MachineBasicBlock in + // the IterationDirection specified by the user. The returned value + // should be "true" if any changes where made to the + // MachineFunction. 
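+  // Illustrative usage (hypothetical pass, not part of this patch):
+  //   struct CountLoads : ARC::InstructionVisitor {
+  //     unsigned NumLoads = 0;
+  //     CountLoads(const ARCSubtarget &ST, MachineFunction &MF)
+  //         : InstructionVisitor(ST, MF) {}
+  //     bool visit(MachineInstr &MI) override {
+  //       if (MI.mayLoad())
+  //         ++NumLoads;
+  //       return false; // nothing in MF was modified
+  //     }
+  //   };
+  //   // CountLoads(ST, MF).examineEachInstr() walks every instruction.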
+ virtual bool visit(MachineInstr &) { return false; } + + // Determine if any changes have been made to the MachineFunction + bool anyChanges() const { return changed; } + + // Return "true" if the specified MachineBasicBlock should be + // excluded from the current iteration + virtual bool excludeBlock(MachineBasicBlock &) { return false; } + + // Specify that the iteration should terminate immediately + void halt() { quit = true; } + + // Visitor which is called prior to iterating over the specified + // MachineBasicBlock + virtual void preBlockCallout(MachineBasicBlock &) {} + + // Visitor which is called after iterating over the specified + // MachineBasicBlock + virtual void postBlockCallout(MachineBasicBlock &, bool BlockWasChanged) {} + + // Visitor which is called prior to calling visit() for the + // specified MachineInstr + virtual void preVisitCallout(MachineInstr &) {} + + // Visitor which is called after to calling visit() for the + // specified MachineInstr + virtual void postVisitCallout(MachineInstr &, bool) {} + + // Examine each instruction in the given basic block in forward order. + // The callback/visitor is called after advancing the iterator, so it + // is allowed to change/delete the current instruction, but it is + // unsafe to delete instructions forward of the one being visited. + bool examineEachInstr(MachineBasicBlock &MBB) { + return iterateBlock(MBB, Forward, 1); + } + + // Examine each instruction in the given basic block in reverse order. + // The callback/visitor is called after advancing the iterator, so it + // is allowed to change/delete the current instruction, but it is + // unsafe to delete instructions before of the one being visited. + bool examineEachInstrInReverse(MachineBasicBlock &MBB) { + return iterateBlock(MBB, Reverse, 1); + } + + // NOTE: The callback/visitor is called after advancing the iterator, + // so it is allowed to change/delete the current instruction, but it + // is unsafe to delete instructions before of the one being visited. + bool examineEachInstrInReverse() { return iterate(Reverse, 1); } + + // NOTE: The callback/visitor is called after advancing the iterator, + // so it is allowed to change/delete the current instruction, but it + // is unsafe to delete instructions before of the one being visited. + bool examineEachInstrInReverseRepeatedly(unsigned maxRepetitions) { + return iterate(Reverse, maxRepetitions); + } + + // NOTE: The callback/visitor is called after advancing the iterator, + // so it is allowed to change/delete the current instruction, but it + // is unsafe to delete instructions forward of the one being visited. + bool examineEachInstr() { return iterate(Forward, 1); } + + // NOTE: The callback/visitor is called after advancing the iterator, + // so it is allowed to change/delete the current instruction, but it + // is unsafe to delete instructions forward of the one being visited. + bool examineEachInstrRepeatedly(unsigned maxRepetitions) { + return iterate(Forward, maxRepetitions); + } + + // Attempt to compute the value of the specified operand. If + // successful the value is written to the output int64_t parameter + // and "true" is returned. + virtual bool getImmed(const MachineOperand &, + /*Output*/ int64_t &) const; + + // Attempt to compute the value of the specified operand. If + // successful and the known value matches \p val return "true". 
+ bool isImmedVal(const MachineOperand &opd, int64_t val) const { + int64_t n; + return getImmed(opd, n) && n == val; + } + + // Getter for getting the bitcode Function for the current + // MachineFunction + Function &getFunction() const { return MF.getFunction(); } + +}; // end class ARC::InstructionVisitor + +// ARC::InstructionVisitor for a MachineFunction prior to register +// allocation. It provides additional functionality that pertains to a +// MachineFunction prior to register allocation (while in SSA form). +class SsaInstructionVisitor : public InstructionVisitor { +public: + SsaInstructionVisitor(const ARCSubtarget &ST, MachineFunction &MF) + : InstructionVisitor(ST, MF) {} + virtual ~SsaInstructionVisitor() {} + + // Find the SSA definition for the specified virtual register + MachineInstr *getSsaDef(Register RegNum) const { + return ARC::getVRegDef(MRI, RegNum); + } + + // Find the SSA definition for the specified virtual register + MachineInstr *getSsaDef(const MachineOperand &opd) const { + return ARC::getVRegDef(MRI, opd); + } + + // Returns true if the virtual register has exactly 1 non-debug use + bool hasOneUse(Register RegNum) const { + if (!Register::isVirtualRegister(RegNum)) + return false; + if (!MRI.hasOneDef(RegNum)) + return false; + return MRI.hasOneNonDBGUse(RegNum); + } + + // Returns true if the virtual register has exactly 1 non-debug use + bool hasOneUse(const MachineOperand &opd) const { + return (opd.isReg()) ? hasOneUse(opd.getReg()) : false; + } + + // Return the SSA definition of a virtual register IFF it has + // exactly 1 non-debug use + MachineInstr *getSingleUseSsaDef(Register RegNum) const { + if (!Register::isVirtualRegister(RegNum)) + return nullptr; + if (!MRI.hasOneDef(RegNum)) + return nullptr; + if (!MRI.hasOneNonDBGUse(RegNum)) + return nullptr; + return MRI.getVRegDef(RegNum); + } + + // Return the SSA definition of a virtual register IFF it has + // exactly 1 non-debug use + MachineInstr *getSingleUseSsaDef(const MachineOperand &opd) const { + return (opd.isReg()) ? getSingleUseSsaDef(opd.getReg()) : nullptr; + } + + // Return the unique use of a virtual register + MachineOperand *getUniqueSsaUse(Register RegNum) const { + if (!Register::isVirtualRegister(RegNum)) + return nullptr; + if (!MRI.hasOneNonDBGUse(RegNum)) + return nullptr; + return &(*MRI.use_nodbg_begin(RegNum)); + } + + // Return the unique use of a virtual register + MachineOperand *getUniqueSsaUse(const MachineOperand &opd) const { + return (opd.isReg()) ? getUniqueSsaUse(opd.getReg()) : nullptr; + } + + // Return true if the specified virtual register has NO uses + bool isUnusedVirtReg(const MachineOperand &opd) const { + return isUnusedVReg(MRI, opd); + } + + // Create a virtual register with the same register class as the + // specified virtual register + Register createVirtReg(Register virtReg) { + return ARC::createVirtReg(&MRI, virtReg); + } + + // Create a virtual register with the same register class as the + // specified virtual register + Register createVirtReg(const MachineOperand &virtRegOpd) { + return ARC::createVirtReg(&MRI, virtRegOpd); + } + + // Find the manifold constant integer value of the specified + // MachineOperand. Returns "true" if the value could be determined. 
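+  // Unlike the InstructionVisitor version, this follows SSA definitions, so
+  // a value materialized by MOV_ru6/MOV_rs12/MOV_rlimm (possibly through a
+  // COPY or a SEXB/SEXH) is still recognized as a constant.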
+ bool getImmed(const MachineOperand &, /*Output*/ int64_t &) const override; + +}; // end class ARC::SsaInstructionVisitor + +} // End namespace ARC + +} // End namespace llvm + +#endif // LLVM_LIB_TARGET_ARC_ARCUTIL_H Index: llvm/lib/Target/ARC/ARCUtil.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/ARC/ARCUtil.cpp @@ -0,0 +1,455 @@ +//===- ARCUtil.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file defines utility functions and classes used throughout +/// the ARC code generator. +//===----------------------------------------------------------------------===// + +#include "ARCUtil.h" +#include "ARC.h" +#include "ARCSubtarget.h" +#include "ARCUtil.h" +#include "MCTargetDesc/ARCMCUtil.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineDominators.h" + +using namespace llvm; + +// Determine if the function was compiled with -Os or with attribute +// minsize +bool ARC::isOptimizeForSpace(const MachineFunction &MF) { + return MF.getFunction().hasFnAttribute(Attribute::OptimizeForSize); +} + +// Create a new virtual register that has the same register class as "virtReg" +Register ARC::createVirtReg(MachineRegisterInfo *MRI, Register virtReg) { + assert(Register::isVirtualRegister(virtReg)); + const TargetRegisterClass *RC = MRI->getRegClass(virtReg); + return MRI->createVirtualRegister(RC); +} + +// Create a new virtual register that has the same register class as +// "virtRegOpd" +Register ARC::createVirtReg(MachineRegisterInfo *MRI, + const MachineOperand &virtRegOpd) { + assert(virtRegOpd.isReg()); + return createVirtReg(MRI, virtRegOpd.getReg()); +} + +// Create the appropriate ADD MachineInstr for the given register +// classes and specified constant +MachineInstr *ARC::buildAddByConstant(const MachineRegisterInfo &MRI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPoint, + DebugLoc DL, Register DestReg, + Register SrcReg, int constant) { + unsigned addOp = 0; + bool is32bit = true; + // TODO + // if (Register::isVirtualRegister(DestReg)) { + // if (MRI.getRegClass(DestReg) == &ARC::WideCoreRegClass || + // MRI.getRegClass(DestReg) == &ARC::WideCore8RegClass) + // is32bit = false; + // } else if (ARC::isReg64(DestReg)) { + // is32bit = false; + // } + if (constant > 0) { + // TODO + // if (constant <= 0x3F) addOp = is32bit ? ARC::ADD_rru6 : ARC::ADDL_rru6; + if (constant <= 0x3F) + addOp = is32bit ? ARC::ADD_rru6 : ARC::ADD_rru6; + // TODO + // else if ( (constant & 0x7E) == constant) addOp = is32bit ? + // ARC::ADD1SCALED_rru6 : ARC::ADD1LSCALED_rru6; else if ( (constant & 0xFC) + // == constant) addOp = is32bit ? ARC::ADD2SCALED_rru6 : + // ARC::ADD2LSCALED_rru6; else if ( (constant & 0x1F8) == constant) addOp = + // is32bit ? ARC::ADD3SCALED_rru6 : ARC::ADD3LSCALED_rru6; + // TODO + // else if (constant < 2048) addOp = is32bit ? ARC::ADD_rrs12 : + // ARC::ADDL_rrs12; else addOp = is32bit ? ARC::ADD_rrlimm : + // ARC::ADDL_rrlimm; + else if (constant < 2048) + addOp = is32bit ? ARC::ADD_rrs12 : ARC::ADD_rrs12; + else + addOp = is32bit ? 
ARC::ADD_rrlimm : ARC::ADD_rrlimm; + } else if (constant == 0) { + return BuildMI( + MBB, InsertPoint, DL, + MBB.getParent()->getSubtarget().getInstrInfo()->get(ARC::COPY)) + .addReg(DestReg, RegState::Define) + .addReg(SrcReg, 0); + } else { + constant = -constant; + // TODO + // if (constant <= 0x3F) addOp = is32bit ? ARC::SUB_rru6 : ARC::SUBL_rru6; + if (constant <= 0x3F) + addOp = is32bit ? ARC::SUB_rru6 : ARC::SUB_rru6; + // TODO + // else if ( (constant & 0x7E) == constant) addOp = is32bit ? + // ARC::SUB1SCALED_rru6 : ARC::SUB1LSCALED_rru6; else if ( (constant & 0xFC) + // == constant) addOp = is32bit ? ARC::SUB2SCALED_rru6 : + // ARC::SUB2LSCALED_rru6; else if ( (constant & 0x1F8) == constant) addOp = + // is32bit ? ARC::SUB3SCALED_rru6 : ARC::SUB3LSCALED_rru6; else if (constant + // < 2048) {addOp = is32bit ? ARC::ADD_rrs12 : ARC::ADDL_rrs12; constant = + // -constant; } + // TODO + // else addOp = is32bit ? ARC::SUB_rrlimm : ARC::SUBL_rrlimm; + else + addOp = is32bit ? ARC::SUB_rrlimm : ARC::SUB_rrlimm; + } + return BuildMI(MBB, InsertPoint, DL, + MBB.getParent()->getSubtarget().getInstrInfo()->get(addOp)) + .addReg(DestReg, RegState::Define) + .addReg(SrcReg, 0) + .addImm(constant); +} + +// Determine if \p amount is in the specified range and has the +// appropriate alignment (mask) +static bool isAdjustedConstantInRange(const MachineOperand &opd, int64_t amount, + int64_t lo, int64_t hi, int64_t mask) { + if (opd.isImm()) { + int64_t k = opd.getImm() + amount; + return (mask & k) == 0 && k >= lo && k <= hi; + } + return false; +} + +// is "MI" a load or store that can handle "offset" as a displacement? +bool ARC::isLoadStoreThatCanHandleDisplacement(const MachineInstr &MI, + int64_t offset) { + int LogScale = 0; + // This function used to be isS9LoadStore, with no pre/post increment forms + // recognized. Setting AllowInc to false preserves that behaviour. + if (ARC::isS9LoadStoreOpcode(MI.getOpcode(), LogScale, /*AllowInc=*/false)) { + // NB: mayLoad rather than mayStore as the above includes PREFETCH ops + // without a dest register. + int ImmIndex = MI.mayLoad() ? 
2 : 1; + if (MI.getOperand(ImmIndex).isImm()) + return isAdjustedConstantInRange(MI.getOperand(ImmIndex), offset, + -(256 << LogScale), 255 << LogScale, + (1u << LogScale) - 1); + } + return false; +} + +// Determine if MI is an add instruction with a constant operand +bool ARC::isAddConstantOp(const MachineInstr &MI, int64_t *amount) { + switch (MI.getOpcode()) { +#undef CASE +#define CASE(OP, Sign, Shift) \ + case ARC::OP##_rrlimm: \ + case ARC::OP##_rrs12: \ + case ARC::OP##_rru6: \ + if (MI.getOperand(2).isImm()) { \ + if (amount) \ + *amount = Sign * MI.getOperand(2).getImm() << Shift; \ + return true; \ + } \ + break + CASE(ADD, 1, 0); + CASE(ADD1, 1, 1); + CASE(ADD2, 1, 2); + CASE(ADD3, 1, 3); + CASE(SUB, -1, 0); + CASE(SUB1, -1, 1); + CASE(SUB2, -1, 2); + CASE(SUB3, -1, 3); + } + return false; +} + +// Determine if \p MI dominates all uses of the specifed virtual +// register +bool ARC::dominatesAllUsesOf(MachineInstr *MI, Register VReg, + MachineDominatorTree *MDT, + MachineRegisterInfo *MRI) { + + assert(Register::isVirtualRegister(VReg) && "Expected virtual register!"); + + for (auto it = MRI->use_nodbg_begin(VReg), end = MRI->use_nodbg_end(); + it != end; ++it) { + MachineInstr *User = it->getParent(); + if (User->isPHI()) { + unsigned BBOperandIdx = User->getOperandNo(&*it) + 1; + MachineBasicBlock *MBB = User->getOperand(BBOperandIdx).getMBB(); + if (MBB->empty()) { + MachineBasicBlock *InstBB = MI->getParent(); + assert(InstBB != MBB && "Instruction found in empty MBB"); + if (!MDT->dominates(InstBB, MBB)) + return false; + continue; + } + User = &*MBB->rbegin(); + } + + if (!MDT->dominates(MI, User)) + return false; + } + return true; +} + +// Is "reg" read between MI (exclusive) and ME (exclusive)? +// +// Specifically, if this function returns false, then an instruction that +// defines "reg" at MI can be moved down to before ME, assuming all other +// constraints have been checked. +// +// Should work on virtual or physical registers. +bool ARC::isUsedBetween(Register reg, MachineBasicBlock::const_iterator MI, + MachineBasicBlock::const_iterator ME, + bool isExclusiveMI, bool stopIfModified) { + MachineBasicBlock::const_iterator end = MI->getParent()->end(); + if (MI == ME) + return false; + assert((end == ME || MI->getParent() == ME->getParent()) && + "Not in same block"); + + const TargetRegisterInfo *TRI = + MI->getParent()->getParent()->getSubtarget().getRegisterInfo(); + + // If we're testing an extension register or possibly XY register, then resort + // to more complex logic. +#if 0 // TODO upstream + if (Register::isPhysicalRegister(reg) && ARC::getRegToIndex(reg) >= 32){ + const ArcRegisterInfo *ATRI = static_cast(TRI); + if (ATRI->isXYReg(reg)) // Deal with XY... + return isXYUsedBetween(reg,MI,ME,ATRI,isExclusiveMI); + const ArcModuleInfo &MINFO = ArcModuleInfo::Find(MI->getParent()->getParent()->getFunction().getParent()); + const ArcModuleInfo::RegDesc *rd = MINFO.getRegister(reg); + if (rd != nullptr && rd->sideEffectOnRead) + return isExtensionRegUsedBetween(reg,*rd,MI,ME,ATRI,MINFO, isExclusiveMI); + } +#endif + + if (isExclusiveMI) + ++MI; + for (; MI != ME && MI != end; ++MI) { + if (MI->isDebugInstr()) + continue; + if (MI->readsRegister(reg, TRI)) + return true; + if (stopIfModified && MI->modifiesRegister(reg, TRI)) + break; + } + return false; +} + +// Replace all uses of "FromReg" to "ToReg". 
Does not modify defs +void ARC::replaceAllUsesWith(MachineRegisterInfo &MRI, Register FromReg, + Register ToReg) { + assert(FromReg != ToReg && "Cannot replace a reg with itself"); + // The logic of this loop was copied from + // MachineRegisterInfo::replaceRegWith(...) + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg), + E = MRI.use_end(); + I != E;) { + MachineOperand &O = *I; + ++I; + O.setReg(ToReg); + } +} + +// Handles calling visit() and the pre/post visit callouts as well +// as maintaining the "changed" member variable +bool ARC::InstructionVisitor::visitOneInstruction(MachineInstr &inst) { + preVisitCallout(inst); + bool instChanged = visit(inst); + changed |= instChanged; + postVisitCallout(inst, instChanged); + return instChanged; +} + +bool ARC::InstructionVisitor::iterateBlockForward(MachineBasicBlock &MBB) { + if (excludeBlock(MBB)) + return false; + quit = false; + preBlockCallout(MBB); + bool blockChanged = false; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E && !quit;) { + // NOTE: The callback/visitor is called after advancing the iterator, + // so it is allowed to change/delete the current instruction. + MachineBasicBlock::iterator J = I; + ++I; + bool instChanged = visitOneInstruction(*J); + blockChanged |= instChanged; + } + deletePending(); + postBlockCallout(MBB, blockChanged); + return blockChanged; +} + +bool ARC::InstructionVisitor::iterateBlockBackward(MachineBasicBlock &MBB) { + if (excludeBlock(MBB)) + return false; + quit = false; + preBlockCallout(MBB); + bool blockChanged = false; + // Ugliness to be sure reverse_iterator is moved off of + // the current instruction in case we delete it + MachineInstr *p[2] = {nullptr, nullptr}; + for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); + I != E && !quit; ++I) { + MachineInstr &Inst = *I; + MachineInstr *T = p[0]; + p[0] = p[1]; + p[1] = &Inst; + if (T) { + bool instChanged = visitOneInstruction(*T); + blockChanged |= instChanged; + } + } + for (unsigned i = 0; i < 2 && !quit; ++i) { + if (p[i]) { + bool instChanged = visitOneInstruction(*p[i]); + blockChanged |= instChanged; + } + } + deletePending(); + postBlockCallout(MBB, blockChanged); + return blockChanged; +} + +bool ARC::InstructionVisitor::iterateBlock(MachineBasicBlock &MBB, + IterationDirection direction, + unsigned maxRepetitions) { + if (direction == Forward) { + for (unsigned K = 0; K < maxRepetitions; ++K) { + if (!iterateBlockForward(MBB)) + break; + } + } else { + for (unsigned K = 0; K < maxRepetitions; ++K) { + if (!iterateBlockBackward(MBB)) + break; + } + } + return changed; +} + +bool ARC::InstructionVisitor::iterate(IterationDirection direction, + unsigned maxRepetitions) { + if (direction == Forward) { + bool changing = true; + for (unsigned K = 0; changing && K < maxRepetitions; ++K) { + changing = false; + ReversePostOrderTraversal RPOT(&MF); + for (ReversePostOrderTraversal::rpo_iterator + I = RPOT.begin(), + E = RPOT.end(); + I != E; ++I) { + auto *MBB = *I; + // for (auto& MBB : MF) { + // Update "changing" because we iterate "while changing" + // up to "maxRepititions". 
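+        // A complete forward sweep that changes nothing leaves "changing"
+        // false, ending the outer loop before maxRepetitions is reached.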
+ changing |= iterateBlockForward(*MBB); + // changed is handled in visitOneInstruction() + } + } + } else { + bool changing = true; + for (unsigned K = 0; changing && K < maxRepetitions; ++K) { + changing = false; + for (po_iterator I = po_begin(&MF), E = po_end(&MF); + I != E; ++I) { + MachineBasicBlock *MBB = *I; + // Update "changing" because we iterate "while changing" + // up to "maxRepititions". + changing |= iterateBlockBackward(*MBB); + // changed is handled in visitOneInstruction() + } + } + } + // Return if anything ever changed while iterating over the entire + // MachineFunction + return anyChanges(); +} + +ARC::InstructionVisitor::InstructionVisitor(const ARCSubtarget &st, + MachineFunction &mf) + : MF(mf), ST(st), TRI(ST.getRegisterInfo()), MRI(mf.getRegInfo()) {} + +bool ARC::InstructionVisitor::getImmed(const MachineOperand &opd, + int64_t &imm) const { + if (opd.isImm()) { + imm = opd.getImm(); + return true; + } + return false; +} + +bool ARC::SsaInstructionVisitor::getImmed(const MachineOperand &opd, + int64_t &imm) const { + if (InstructionVisitor::getImmed(opd, imm)) + return true; + return ARC::getImmed(MRI, opd, imm); +} + +// Find the constant value of the operand +bool ARC::getImmed(MachineRegisterInfo &MRI, const MachineOperand &opd, + int64_t &imm) { + if (opd.isImm()) { + imm = opd.getImm(); + return true; + } + MachineInstr *def = ARC::getVRegDef(MRI, opd); + if (def == nullptr) + return false; + switch (def->getOpcode()) { + case TargetOpcode::COPY: + // When it is a sub-register we may have to truncate it + if (def->getOperand(1).getSubReg()) + break; + return ARC::getImmed(MRI, def->getOperand(1), imm); + case ARC::MOV_ru6: + case ARC::MOV_rs12: + case ARC::MOV_rlimm: + if (def->getOperand(1).isImm()) { + imm = def->getOperand(1).getImm(); + return true; + } + break; +#undef CASE +#define CASE(OP, size) \ + case ARC::OP##_rr: \ + if (getImmed(MRI, def->getOperand(1), imm)) { \ + imm <<= 64 - size; \ + imm >>= 64 - size; \ + return true; \ + } \ + break + CASE(SEXB, 8); + CASE(SEXH, 16); + } // end switch + return false; +} + +void ARC::InstructionVisitor::toBeDeleted(MachineInstr *inst, + bool clearOperands) { + // Change the descrptor so current iteration does not trip some other + // optimization on a deleted instruction + if (inst && inst->getOpcode() != ARC::TBD) { + inst->setDesc(ST.getInstrInfo()->get(ARC::TBD)); + if (clearOperands) { + // Remove all operands so that we don't see reg references. + while (inst->getNumOperands() > 0) + inst->RemoveOperand(inst->getNumOperands() - 1); + } + tbd.push_back(inst); + } +} + +void ARC::InstructionVisitor::deletePending() { + for (auto *MI : tbd) { + MI->eraseFromParent(); + changed = true; + } + tbd.clear(); +} Index: llvm/lib/Target/ARC/CMakeLists.txt =================================================================== --- llvm/lib/Target/ARC/CMakeLists.txt +++ llvm/lib/Target/ARC/CMakeLists.txt @@ -26,6 +26,7 @@ ARCRegisterInfo.cpp ARCSubtarget.cpp ARCTargetMachine.cpp + ARCUtil.cpp LINK_COMPONENTS Analysis Index: llvm/lib/Target/ARC/MCTargetDesc/ARCMCUtil.h =================================================================== --- /dev/null +++ llvm/lib/Target/ARC/MCTargetDesc/ARCMCUtil.h @@ -0,0 +1,62 @@ +//===- ARCMCUtil.h +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file defines utility functions and classes used throughout
+/// the ARC MC and code generator layers.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARC_MCTARGETDESC_ARCMCUTIL_H
+#define LLVM_LIB_TARGET_ARC_MCTARGETDESC_ARCMCUTIL_H
+
+#include "ARC.h"
+
+namespace llvm {
+#if 0
+} fix emacs;
+#endif
+
+namespace ARC {
+#if 0
+} fix emacs;
+#endif
+
+// Get load/store RS9 form
+unsigned getConversionToRS9(unsigned);
+
+// Get load/store AW form (pre-increment)
+unsigned getConversionToAW(unsigned);
+
+// Get load/store AB form (post-increment)
+unsigned getConversionToAB(unsigned);
+
+// Get load/store RRAW form (indexed pre-increment)
+unsigned getConversionToRRAW(unsigned);
+
+// Get load/store RRAB form (indexed post-increment)
+unsigned getConversionToRRAB(unsigned);
+
+// Get load/store from indexed to RRAB form (indexed post-increment)
+unsigned getConversionFromRRToRRAB(unsigned);
+
+// Get load/store from scaled indexed to RRAB form (indexed post-increment)
+unsigned getConversionFromScaledToRRAB(unsigned);
+
+// This is adapted from the previous implementation of
+// isLoadStoreThatCanHandleDisplacement. That implementation excluded pre/post
+// indexed forms, presumably to avoid modifying those forms. Instead, we take a
+// parameter that controls whether those forms are allowed.
+// TODO: the no-di cases have irs9 forms based on the current TD files, but are
+// omitted here. Is this right?
+bool isS9LoadStoreOpcode(unsigned Opc, int &LogScale, bool AllowInc);
+
+} // End namespace ARC
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARC_MCTARGETDESC_ARCMCUTIL_H
Index: llvm/lib/Target/ARC/MCTargetDesc/ARCMCUtil.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/ARC/MCTargetDesc/ARCMCUtil.cpp
@@ -0,0 +1,205 @@
+//===- ARCMCUtil.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file defines utility functions and classes used throughout
+/// the ARC MC and code generator layers.
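+///
+/// For example, getConversionToAB(ARC::LD_rs9) yields ARC::LD_AB_rs9, the
+/// post-increment form that writes the incremented address back to the base
+/// register after the access; opcodes with no handled variant map to 0.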
+//===----------------------------------------------------------------------===//
+
+#include "ARCMCUtil.h"
+#include "../ARC.h"
+
+using namespace llvm;
+
+// Get load/store RS9 form
+unsigned ARC::getConversionToRS9(unsigned OP) {
+  // TODO upstream indexed addressing mode
+  return 0;
+}
+
+// Get load/store AW form (pre-increment)
+unsigned ARC::getConversionToAW(unsigned OP) {
+  // TODO upstream ARC64 and FPU opcodes
+  // TODO upstream signed and unsigned limm offsets
+  switch (OP) {
+#undef CASE
+#define CASE(OP) \
+  case ARC::OP##_rs9: \
+    return ARC::OP##_AW_rs9; \
+  case ARC::OP##_DI_rs9: \
+    return ARC::OP##_DI_AW_rs9
+  // CASE(LDU);
+  CASE(LDB_X);
+  // CASE(WIDE_LDSB);
+  CASE(LDH_X);
+  // CASE(WIDE_LDSH);
+  CASE(LD);
+  // CASE(LDD);
+  CASE(LDB);
+  CASE(LDH);
+  // TODO add immediate W6 forms
+  // CASE(STD);
+  CASE(ST);
+  CASE(STH);
+  CASE(STB);
+  // TODO upstream prefetch & VDSP
+  }
+  return 0;
+}
+
+// Get load/store AB form (post-increment)
+unsigned ARC::getConversionToAB(unsigned OP) {
+  // TODO upstream ARC64 and FPU opcodes
+  // TODO upstream signed and unsigned limm offsets
+  switch (OP) {
+#undef CASE
+#define CASE(OP) \
+  case ARC::OP##_rs9: \
+    return ARC::OP##_AB_rs9; \
+  case ARC::OP##_DI_rs9: \
+    return ARC::OP##_DI_AB_rs9
+  CASE(LD);
+  CASE(LDB_X);
+  CASE(LDB);
+  CASE(LDH_X);
+  CASE(LDH);
+  // TODO upstream prefetch & VDSP
+  }
+  return 0;
+}
+
+// TODO/FIXME Support all forms of indexed addressing
+
+// Get load/store RRAW form (indexed pre-increment)
+unsigned ARC::getConversionToRRAW(unsigned OP) { return 0; }
+
+// Get load/store RRAB form (indexed post-increment)
+unsigned ARC::getConversionToRRAB(unsigned OP) { return 0; }
+
+// Get load/store from indexed to RRAB form (indexed post-increment)
+unsigned ARC::getConversionFromRRToRRAB(unsigned OP) { return 0; }
+
+// Get load/store from scaled indexed to RRAB form (indexed post-increment)
+unsigned ARC::getConversionFromScaledToRRAB(unsigned OP) { return 0; }
+
+// This is adapted from the previous implementation of
+// isLoadStoreThatCanHandleDisplacement. That implementation excluded pre/post
+// indexed forms, presumably to avoid modifying those forms. Instead, we take a
+// parameter that controls whether those forms are allowed.
+// TODO: the no-di cases have irs9 forms based on the current TD files, but are
+// omitted here. Is this right?
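+// On a match, LogScale is set to the log2 of the access width encoded in the
+// tables below (0 for byte, 1 for half-word, 2 for word accesses); callers
+// presumably use it when forming scaled-index offsets.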
+bool ARC::isS9LoadStoreOpcode(unsigned OP, int &LogScale, bool AllowInc) {
+  switch (OP) {
+#undef CASE
+#define CASE(LD, Shift) \
+  case ARC::LD##_rs9: \
+  case ARC::LD##_DI_rs9: \
+    LogScale = Shift; \
+    return true
+#undef CASE_NO_DI
+#define CASE_NO_DI(LD, Shift) \
+  case ARC::LD##_rs9: \
+    LogScale = Shift; \
+    return true
+  CASE(LDB_X, 0);
+  // TODO CASE(WIDE_LDSB);
+  CASE(LDB, 0);
+  // case ARC::LDSB_rs9_aq:
+  // case ARC::LDUB_rs9_aq:
+  CASE(LDH_X, 1);
+  // CASE(WIDE_LDSH):
+  CASE(LDH, 1);
+  // CASE_NO_DI(FLD16):
+  // case ARC::LDSH_rs9_aq:
+  // case ARC::LDUH_rs9_aq:
+  CASE(LD, 2);
+  // CASE(LDD):
+  // CASE_NO_DI(LDS):
+  // CASE(LDU):
+  // CASE_NO_DI(FLD32):
+  // CASE_NO_DI(FLD64):
+  // CASE_NO_DI(FLD128):
+  // case ARC::LD_rs9_aq:
+  // CASE_NO_DI(LDL):
+  // CASE_NO_DI(LDDL):
+  // case ARC::LDL_rs9_aq:
+  // LogScale = 3;
+  // return true;
+
+  // TODO upstream STORE W6 opcodes
+  CASE(ST, 2);
+  // CASE(STD):
+  // CASE_NO_DI(FST128):
+  // CASE_NO_DI(FST64):
+  // CASE_NO_DI(FST32):
+  // case ARC::PREFETCH_rs9:
+  // case ARC::PREFETCHW_rs9:
+  // case ARC::PREALLOC_rs9:
+  // case ARC::ST_rs9_rl:
+  CASE(STH, 1);
+  // CASE_NO_DI(FST16):
+  // case ARC::STH_rs9_rl:
+  CASE(STB, 0);
+  // TODO upstream limmrs9 addr mode
+  // case ARC::ST_limmrs9:
+  // case ARC::STD_limmrs9:
+  // case ARC::STH_limmrs9:
+  // case ARC::STB_limmrs9:
+  // case ARC::ST_DI_limmrs9:
+  // case ARC::STD_DI_limmrs9:
+  // case ARC::STH_DI_limmrs9:
+  // case ARC::STB_DI_limmrs9:
+  // case ARC::STB_rs9_rl:
+  }
+
+  // Only incrementing forms after this point!
+  if (!AllowInc)
+    return false;
+
+  switch (OP) {
+#undef CASE_NO_DI
+#define CASE_NO_DI(OP, Shift) \
+  case ARC::OP##_AB_rs9: \
+  case ARC::OP##_AW_rs9: \
+    LogScale = Shift; \
+    return true;
+#undef CASE
+#define CASE(OP, Shift) \
+  case ARC::OP##_AB_rs9: \
+  case ARC::OP##_AW_rs9: \
+  case ARC::OP##_DI_AB_rs9: \
+  case ARC::OP##_DI_AW_rs9: \
+    LogScale = Shift; \
+    return true
+  CASE(LDB_X, 0);
+  CASE(LDB, 0);
+  CASE(LDH_X, 1);
+  CASE(LDH, 1);
+  CASE(LD, 2);
+  // CASE(LDD):
+  // CASE_NO_DI(LDS):
+  // CASE(LDU):
+  // CASE_NO_DI(LDL):
+  // CASE_NO_DI(LDDL):
+
+  // TODO upstream W6 and LIMM/SLIMM forms
+  CASE(ST, 2);
+  // CASE(STD):
+  CASE(STH, 1);
+  CASE(STB, 0);
+  // CASE_NO_DI(STL):
+  // CASE_NO_DI(STDL):
+  // CASE_NO_DI(FST16):
+  // CASE_NO_DI(FST32):
+  // CASE_NO_DI(FST64):
+  // CASE_NO_DI(FLD16):
+  // CASE_NO_DI(FLD32):
+  // CASE_NO_DI(FLD64):
+  }
+  return false;
+}
Index: llvm/lib/Target/ARC/MCTargetDesc/CMakeLists.txt
===================================================================
--- llvm/lib/Target/ARC/MCTargetDesc/CMakeLists.txt
+++ llvm/lib/Target/ARC/MCTargetDesc/CMakeLists.txt
@@ -2,6 +2,7 @@
   ARCInstPrinter.cpp
   ARCMCTargetDesc.cpp
   ARCMCAsmInfo.cpp
+  ARCMCUtil.cpp
 
   LINK_COMPONENTS
   MC