Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -21,10 +21,12 @@
 #include "R600MachineScheduler.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
+#include "SIMachineScheduler.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -47,9 +49,17 @@
   return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
 }
 
+static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
+  return new SIScheduleDAGMI(C);
+}
+
+static MachineSchedRegistry
+R600SchedRegistry("r600", "Run R600's custom scheduler",
+                  createR600MachineScheduler);
+
 static MachineSchedRegistry
-SchedCustomRegistry("r600", "Run R600's custom scheduler",
-                    createR600MachineScheduler);
+SISchedRegistry("si", "Run SI's custom scheduler",
+                createSIMachineScheduler);
 
 static std::string computeDataLayout(const Triple &TT) {
   std::string Ret = "e-p:32:32";
@@ -121,6 +131,8 @@
     const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       return createR600MachineScheduler(C);
+    // else
+    //   return createSIMachineScheduler(C); // Uncomment to make this the default for SI.
     return nullptr;
   }
 
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -53,6 +53,7 @@
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
   SIMachineFunctionInfo.cpp
+  SIMachineScheduler.cpp
   SIPrepareScratchRegs.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -345,6 +345,8 @@
 
   uint64_t getDefaultRsrcDataFormat() const;
 
+  bool isLowLatencyInstruction(const MachineInstr *MI) const;
+  bool isHighLatencyInstruction(const MachineInstr *MI) const;
 };
 
 namespace AMDGPU {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2717,3 +2717,15 @@
 
   return RsrcDataFormat;
 }
+
+bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const {
+  unsigned Opc = MI->getOpcode();
+
+  return isSMRD(Opc);
+}
+
+bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const {
+  unsigned Opc = MI->getOpcode();
+
+  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
+}
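The two predicates above feed the wait behaviour described in SIMachineScheduler.cpp: using the result of a scalar (SMRD) load forces a wait on all scalar loads issued so far. The following standalone sketch (a toy model, not the LLVM API; the "wait drains everything" rule is the only assumption) shows why the scheduler prefers grouping such loads over interleaving load/use pairs:

#include <cstdio>
#include <vector>

struct Op { bool IsLoad; int LoadId; }; // LoadId: which load a use consumes

static int countWaits(const std::vector<Op> &Prog) {
  int Outstanding = 0, Waits = 0;
  for (const Op &O : Prog) {
    if (O.IsLoad) { ++Outstanding; continue; }
    if (Outstanding) { ++Waits; Outstanding = 0; } // a use waits for ALL loads
  }
  return Waits;
}

int main() {
  std::vector<Op> Interleaved = {{true, 0}, {false, 0}, {true, 1}, {false, 1}};
  std::vector<Op> Grouped     = {{true, 0}, {true, 1}, {false, 0}, {false, 1}};
  std::printf("interleaved: %d waits\n", countWaits(Interleaved)); // 2
  std::printf("grouped:     %d waits\n", countWaits(Grouped));     // 1
  return 0;
}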
Index: lib/Target/AMDGPU/SIMachineScheduler.h
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIMachineScheduler.h
@@ -0,0 +1,396 @@
+//===-- SIMachineScheduler.h - SI Scheduler Interface ---------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+namespace llvm {
+
+enum SIScheduleCandReason {
+  NoCand,
+  RegUsage,
+  Latency,
+  Successor,
+  Depth,
+  NodeOrder
+};
+
+struct SISchedulerCandidate {
+  // The reason for this candidate.
+  SIScheduleCandReason Reason;
+
+  // Set of reasons that apply to multiple candidates.
+  uint32_t RepeatReasonSet;
+
+  SISchedulerCandidate()
+    : Reason(NoCand), RepeatReasonSet(0) {}
+
+  bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
+  void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
+};
+
+class SIScheduleDAGMI;
+class SIScheduleBlockCreator;
+
+class SIScheduleBlock {
+  SIScheduleDAGMI *DAG;
+  SIScheduleBlockCreator *BC;
+
+  std::vector<SUnit*> SUnits;
+  std::map<unsigned, unsigned> NodeNum2Index;
+  std::vector<SUnit*> TopReadySUs;
+  std::vector<SUnit*> ScheduledSUnits;
+
+  /// The top of the unscheduled zone.
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
+
+  std::vector<unsigned> InternalAdditionnalPressure;
+  std::vector<unsigned> LiveInPressure;
+  std::vector<unsigned> LiveOutPressure;
+  std::set<unsigned> LiveInRegs;
+  std::set<unsigned> LiveOutRegs;
+
+  bool Scheduled;
+  bool HighLatencyBlock;
+
+  std::vector<unsigned> HasLowLatencyNonWaitedParent;
+
+public:
+  SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
+                  unsigned ID):
+    DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(),
+    TopRPTracker(TopPressure), Scheduled(false),
+    HighLatencyBlock(false), ID(ID),
+    Preds(), Succs() {}
+
+  ~SIScheduleBlock() {}
+
+  // Unique ID: the index of the block in the SIScheduleDAGMI Blocks table.
+  unsigned ID;
+
+  /// Functions for block construction.
+  void addUnit(SUnit *SU);
+  // Called when all SUs have been added.
+  void finalizeUnits();
+  // Add block pred, which has an instruction predecessor of SU.
+  void addPred(SIScheduleBlock *Pred);
+  void addSucc(SIScheduleBlock *Succ);
+
+  std::vector<SIScheduleBlock*> Preds;  // All block predecessors.
+  std::vector<SIScheduleBlock*> Succs;  // All block successors.
+
+  unsigned NumHighLatencySuccessors = 0;
+
+  bool isHighLatencyBlock() { return HighLatencyBlock; }
+
+  // This is approximate.
+  // Ideally it should take into account that some instructions
+  // (rcp, etc.) are four times slower.
+  int getCost() { return SUnits.size(); }
+
+  // The block predecessors and successors must all be registered
+  // before schedule().
+  // Fast schedule with no particular requirement.
+  void fastSchedule();
+
+  std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
+
+  // Complete schedule that will try to minimize register pressure and
+  // hide low latencies, and will fill in correct live-in and live-out
+  // tracking. Needs all MIs to be grouped between BeginBlock and EndBlock.
+  // The MIs can be moved after the scheduling; the grouping is only used
+  // to allow precise computation.
+  void schedule(MachineBasicBlock::iterator BeginBlock,
+                MachineBasicBlock::iterator EndBlock);
+
+  bool isScheduled() { return Scheduled; }
+
+  // Needs the block to have been scheduled.
+  // TODO: find a way to compute it.
+  std::vector<unsigned> &getInternalAdditionnalRegUsage() {
+    return InternalAdditionnalPressure;
+  }
+
+  std::set<unsigned> &getInRegs() { return LiveInRegs; }
+  std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
+
+  void printDebug(bool Full);
+
+private:
+  struct SISchedCandidate : SISchedulerCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    unsigned SGPRUsage;
+    unsigned VGPRUsage;
+    bool IsLowLatency;
+    unsigned LowLatencyOffset;
+    bool HasLowLatencyNonWaitedParent;
+
+    SISchedCandidate()
+      : SU(nullptr) {}
+
+    bool isValid() const { return SU; }
+
+    // Copy the status of another candidate without changing policy.
+    void setBest(SISchedCandidate &Best) {
+      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+      SU = Best.SU;
+      Reason = Best.Reason;
+      SGPRUsage = Best.SGPRUsage;
+      VGPRUsage = Best.VGPRUsage;
+      IsLowLatency = Best.IsLowLatency;
+      LowLatencyOffset = Best.LowLatencyOffset;
+      HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
+    }
+  };
+
+  void undoSchedule();
+  // InOrOutBlock: restrict to links pointing inside the block (true),
+  // or restrict to links pointing outside the block (false).
+  void undoReleaseSucc(SUnit *SU, SDep *SuccEdge, bool InOrOutBlock);
+  void releaseSucc(SUnit *SU, SDep *SuccEdge, bool InOrOutBlock);
+  void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
+  void NodeScheduled(SUnit *SU);
+  void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+  void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+  SUnit* pickNode();
+  void traceCandidate(const SISchedCandidate &Cand);
+  void initRegPressure(MachineBasicBlock::iterator BeginBlock,
+                       MachineBasicBlock::iterator EndBlock);
+};
+
+enum SISchedulerBlockCreatorVariant {
+  LatenciesAlone
+};
+
+class SIScheduleBlockCreator {
+  SIScheduleDAGMI *DAG;
+  // unique_ptr handles freeing memory for us.
+  std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
+  std::map<SISchedulerBlockCreatorVariant,
+           std::vector<SIScheduleBlock*>> Blocks;
+  std::vector<SIScheduleBlock*> CurrentBlocks;
+  std::vector<int> Node2CurrentBlock;
+
+  /// Topological sort.
+  /// Index2Node - Maps topological index to the node number.
+  std::vector<int> TopDownIndex2Node;
+  std::vector<int> BottomUpIndex2Node;
+  /// Node2Index - Maps the node number to its topological index.
+  std::vector<int> TopDownNode2Index;
+  std::vector<int> BottomUpNode2Index;
+
+  // 0 -> color not given.
+  // 1 to SUnits.size() -> reserved group (you should only add elements to them).
+  // Above -> other groups.
+  int NextReservedID;
+  int NextNonReservedID;
+  std::vector<int> CurrentColoring;
+  std::vector<int> CurrentTopDownReservedDependencyColoring;
+  std::vector<int> CurrentBottomUpReservedDependencyColoring;
+
+public:
+  SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
+  ~SIScheduleBlockCreator();
+
+  std::vector<SIScheduleBlock*>
+  getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
+
+  bool isSUInBlock(SUnit *SU, unsigned ID);
+
+private:
+  void topologicalSort();
+  // Give a reserved color to every high latency instruction.
+  void colorHighLatenciesAlone();
+  // Create groups of high latencies with a reserved color.
+  void colorHighLatenciesGroups();
+  // Compute coloring for top-down and bottom-up traversals with
+  // different colors depending on dependencies on reserved colors.
+  void colorComputeReservedDependencies();
+  // Give a color to every non-colored SU according to reserved group
+  // dependencies.
+  void colorAccordingToReservedDependencies();
+  // Cut groups into groups of SUs in consecutive order (except for
+  // reserved groups).
+  void colorForceConsecutiveOrderInGroup();
+  // Merge constant loads into the group containing all their users
+  // (TODO: else if all their users depend on the same group, put them there).
+  void colorMergeConstantLoadsNextGroup();
+  // Merge SUs whose users are all in a single other group into that group.
+  void colorMergeIfPossibleNextGroup();
+  // Same, but only when the target group is reserved.
+  void colorMergeIfPossibleNextGroupOnlyForReserved();
+  void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
+  void scheduleInsideBlocks();
+};
+
+enum SISchedulerBlockSchedulerVariant {
+  BlockLatency
+};
+
+class SIScheduleBlockScheduler {
+  SIScheduleDAGMI *DAG;
+  SISchedulerBlockSchedulerVariant Variant;
+  std::vector<SIScheduleBlock*> Blocks;
+
+  std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
+  std::set<unsigned> LiveRegs;
+  // Number of schedulable unscheduled blocks reading the register.
+  std::map<unsigned, unsigned> LiveRegsConsumers;
+
+  std::vector<unsigned> LastPosHighLatencyParentScheduled;
+
+  std::vector<SIScheduleBlock*> BlocksScheduled;
+  unsigned NumBlockScheduled;
+  std::vector<SIScheduleBlock*> ReadyBlocks;
+
+  unsigned VregCurrentUsage;
+  unsigned SregCurrentUsage;
+
+  // Currently only an approximation: it contains the maximum register
+  // usage between two blocks instead of the maximum register usage
+  // in general.
+  unsigned maxVregUsage;
+  unsigned maxSregUsage;
+
+  std::vector<unsigned> BlockNumPredsLeft;
+  std::vector<unsigned> BlockNumSuccsLeft;
+
+public:
+  SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                           SISchedulerBlockSchedulerVariant Variant,
+                           std::vector<SIScheduleBlock*> Blocks);
+  ~SIScheduleBlockScheduler() {}
+
+  std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
+
+  unsigned getVGPRUsage() { return maxVregUsage; }
+  unsigned getSGPRUsage() { return maxSregUsage; }
+
+private:
+  struct SIBlockSchedCandidate : SISchedulerCandidate {
+    // The best block candidate.
+    SIScheduleBlock *Block;
+
+    bool IsHighLatency;
+    int VGPRUsageDiff;
+    unsigned NumHighLatencySuccessors;
+    unsigned LastPosHighLatParentScheduled;
+
+    SIBlockSchedCandidate()
+      : Block(nullptr) {}
+
+    bool isValid() const { return Block; }
+
+    // Copy the status of another candidate without changing policy.
+    void setBest(SIBlockSchedCandidate &Best) {
+      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+      Block = Best.Block;
+      Reason = Best.Reason;
+      IsHighLatency = Best.IsHighLatency;
+      VGPRUsageDiff = Best.VGPRUsageDiff;
+      NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
+      LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
+    }
+  };
+
+  void tryCandidate(SIBlockSchedCandidate &Cand,
+                    SIBlockSchedCandidate &TryCand);
+  SIScheduleBlock *pickBlock();
+
+  void addLiveRegs(std::set<unsigned> &Regs);
+  void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
+  void releaseBlockSuccs(SIScheduleBlock *Parent);
+  void blockScheduled(SIScheduleBlock *Block);
+
+  // Check the register pressure change of scheduling a block with these
+  // live-ins and live-outs.
+  std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
+                                       std::set<unsigned> &OutRegs);
+
+  void schedule();
+};
+
+class SIScheduler {
+  SIScheduleDAGMI *DAG;
+  SIScheduleBlockCreator BlockCreator;
+
+public:
+  SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
+
+  ~SIScheduler() {}
+
+  std::vector<unsigned>
+  scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+                  SISchedulerBlockSchedulerVariant ScheduleVariant);
+};
+
+class SIScheduleDAGMI : public ScheduleDAGMILive {
+  const SIInstrInfo *SITII;
+  const SIRegisterInfo *SITRI;
+
+  std::vector<SUnit> SUnitsLinksBackup;
+
+  // For moveLowLatencies. Filled after all scheduling variants are tested.
+  std::vector<unsigned> ScheduledSUnits;
+  std::vector<unsigned> ScheduledSUnitsInv;
+
+public:
+  SIScheduleDAGMI(MachineSchedContext *C);
+
+  ~SIScheduleDAGMI() override;
+
+  // Entry point for the schedule.
+  void schedule() override;
+
+  // To init a Block's RPTracker.
+  void initRPTracker(RegPressureTracker &RPTracker) {
+    RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+  }
+
+  MachineBasicBlock *getBB() { return BB; }
+  MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
+  MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
+  LiveIntervals *getLIS() { return LIS; }
+  MachineRegisterInfo *getMRI() { return &MRI; }
+  const TargetRegisterInfo *getTRI() { return TRI; }
+  SUnit *getEntrySU() { return &EntrySU; }
+  SUnit *getExitSU() { return &ExitSU; }
+
+  void restoreSULinksLeft();
+
+  template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
+                                                     _Iterator End,
+                                                     unsigned &VgprUsage,
+                                                     unsigned &SgprUsage);
+
+  std::set<unsigned> getInRegs() {
+    std::set<unsigned> InRegs(RPTracker.getPressure().LiveInRegs.begin(),
+                              RPTracker.getPressure().LiveInRegs.end());
+    return InRegs;
+  }
+
+private:
+  // After scheduling is done, improve low latency placements.
+  void moveLowLatencies();
+
+public:
+  unsigned VGPRSetID;
+  unsigned SGPRSetID;
+  // Some stats for scheduling inside blocks.
+  std::vector<unsigned> IsLowLatencySU;
+  std::vector<unsigned> LowLatencyOffset;
+  std::vector<unsigned> IsHighLatencySU;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
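With the two MachineSchedRegistry entries registered in AMDGPUTargetMachine.cpp, either strategy should be selectable by name through the usual machine-scheduler plumbing (assuming the standard `-misched=<name>` registry behaviour), e.g. `llc -march=amdgcn -misched=si` for the new scheduler and `-misched=r600` for the old one; the commented-out branch in createMachineScheduler would make the SI scheduler the default.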
Index: lib/Target/AMDGPU/SIMachineScheduler.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -0,0 +1,1707 @@
+//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIMachineScheduler.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// This scheduler implements a different scheduling algorithm than
+// GenericScheduler.
+//
+// There are several architecture-specific behaviours that GenericScheduler
+// does not model:
+// . When accessing the result of an SGPR load instruction, you have to wait
+//   for all SGPR load instructions issued before your current instruction
+//   to have finished.
+// . When accessing the result of a VGPR load instruction, you have to wait
+//   for all VGPR load instructions issued before the VGPR load instruction
+//   you are interested in to finish.
+// . The lower the register pressure, the better load latencies are hidden.
+//
+// Moreover, some specificities (like the fact that many instructions in a
+// shader have few dependencies) make the generic scheduler behave
+// unpredictably. For example, when register pressure becomes high, it can
+// either manage to prevent register pressure from going too high, or it can
+// increase register pressure even more than if it hadn't taken register
+// pressure into account.
+//
+// Some other bad behaviours show up as well, like loading a constant into a
+// VGPR at the beginning of the shader when it isn't needed until the end of
+// the shader.
+//
+// The scheduling problem for SI has three main parts:
+// . Hiding high latencies (texture sampling, etc.)
+// . Hiding low latencies (SGPR constant loading, etc.)
+// . Keeping register usage low for better latency hiding and general
+//   performance
+//
+// This scheduler tries to solve the scheduling problem by dividing it into
+// simpler sub-problems. It divides the instructions into blocks, schedules
+// locally inside the blocks (where it takes care of low latencies), and then
+// chooses the order of the blocks (where it takes care of high latencies).
+// Dividing the instructions into blocks also helps keep register usage low.
+//
+// First, the instructions are put into blocks.
+// We want the blocks to help control register usage and to help hide high
+// latencies later. To help control register usage, we typically want all
+// local computations - for example, a result that is consumed right away -
+// to be contained in a block. Block inputs and outputs would typically be
+// important results that are needed in several locations of the shader.
+// Since we do want blocks to help hide high latencies, we want the
+// instructions inside a block to have a minimal set of dependencies on high
+// latencies; that makes it easy to pick blocks to hide specific latencies.
+// The current block creation algorithm puts instructions that depend on the
+// same high latency instructions together, and adds constant loads to the
+// blocks that use them if they are the only user. This algorithm can be
+// improved.
+//
+// Second, the order of the instructions inside the blocks is chosen.
+// At that step we take into account only register usage and hiding low
+// latency instructions.
+//
+// Third, the block order is chosen; there we try to hide high latencies
+// and keep register usage low.
+//
+// After the third step, a pass is done to improve the hiding of low
+// latencies.
+//
+// When talking about 'low latency' or 'high latency', this includes both
+// the latency for the data to go from the cache (or global memory) to a
+// register, and the bandwidth limitations. Increasing the number of active
+// wavefronts helps hide the former, but it doesn't solve the latter; hence,
+// even if the wavefront count is high, we have to try to have as many
+// instructions hiding high latencies as possible.
+// For example, the OpenCL doc gives a latency of 400 cycles for a global
+// memory access, which is hidden by 10 instructions if the wavefront count
+// is 10.
+
+// Some figures taken from AMD docs:
+// Both texture and constant L1 caches are 4-way associative with 64-byte
+// lines.
+// The constant cache is shared among 4 CUs.
+// For texture sampling, the address generation unit receives 4 texture
+// addresses per cycle, thus we could expect texture sampling latency to be
+// equivalent to 4 instructions in the very best case (a VGPR is 64 work
+// items, instructions in a wavefront group are executed every 4 cycles),
+// or 16 instructions if the other wavefronts associated to the 3 other
+// VALUs of the CU do texture sampling too. (Don't take these figures too
+// seriously, as I'm not 100% sure of the computation.)
+// Data exports should get similar latency.
+// For constant loading, the cache is shared among 4 CUs.
+// The doc says "a throughput of 16B/cycle for each of the 4 Compute Units".
+// I guess that if the other CUs don't read the cache, it can go up to
+// 64B/cycle. It means a simple s_buffer_load should take one instruction to
+// hide, as well as an s_buffer_loadx2, and potentially an s_buffer_loadx8
+// if on the same cache line.
+//
+// As of today the driver doesn't preload the constants in cache, thus the
+// first loads get extra latency. The doc says a global memory access can be
+// 300-600 cycles. We do not specifically take that into account when
+// scheduling, as we expect the driver to be able to preload the constants
+// soon.
+
+// Common code //
+
+#ifndef NDEBUG
+
+static const char *getReasonStr(SIScheduleCandReason Reason) {
+  switch (Reason) {
+  case NoCand:    return "NOCAND    ";
+  case RegUsage:  return "REGUSAGE  ";
+  case Latency:   return "LATENCY   ";
+  case Successor: return "SUCCESSOR ";
+  case Depth:     return "DEPTH     ";
+  case NodeOrder: return "ORDER     ";
+  }
+  llvm_unreachable("Unknown reason!");
+}
+
+#endif
+
+static bool tryLess(int TryVal, int CandVal,
+                    SISchedulerCandidate &TryCand,
+                    SISchedulerCandidate &Cand,
+                    SIScheduleCandReason Reason) {
+  if (TryVal < CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal > CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+static bool tryGreater(int TryVal, int CandVal,
+                       SISchedulerCandidate &TryCand,
+                       SISchedulerCandidate &Cand,
+                       SIScheduleCandReason Reason) {
+  if (TryVal > CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal < CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+// SIScheduleBlock //
+
+void SIScheduleBlock::addUnit(SUnit *SU) {
+  NodeNum2Index[SU->NodeNum] = SUnits.size();
+  SUnits.push_back(SU);
+}
+
+#ifndef NDEBUG
+
+void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
+  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+  dbgs() << '\n';
+}
+
+#endif
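The tryLess/tryGreater helpers above implement a lexicographic comparison: criteria are tested in priority order, the first one that discriminates decides the winner, and the winning reason is recorded in the candidate. A standalone sketch of the pattern (toy types and criteria, not the patch's classes):

#include <cstdio>

enum Reason { NoCand, RegUsage, NodeOrder };

struct Cand {
  int Pressure;
  int Order;
  Reason Why;
};

// Mirrors tryLess: returns true when the criterion discriminates;
// Try.Why is set only if Try wins on this criterion.
static bool tryLess(int TryVal, int CandVal, Cand &Try, Reason R) {
  if (TryVal < CandVal) {
    Try.Why = R;
    return true;
  }
  return TryVal > CandVal;
}

// Returns true if Try should replace Best (Try.Why then explains why).
static bool tryCandidate(const Cand &Best, Cand &Try) {
  if (Best.Why == NoCand) { // No best yet: accept unconditionally.
    Try.Why = NodeOrder;
    return true;
  }
  if (tryLess(Try.Pressure, Best.Pressure, Try, RegUsage))
    return Try.Why != NoCand;
  if (tryLess(Try.Order, Best.Order, Try, NodeOrder))
    return Try.Why != NoCand;
  return false; // Ties everywhere: keep the current best.
}

int main() {
  Cand Best = {0, 0, NoCand};
  Cand A = {4, 1, NoCand};
  Cand B = {2, 3, NoCand};
  if (tryCandidate(Best, A)) Best = A;
  if (tryCandidate(Best, B)) Best = B; // wins on RegUsage (2 < 4)
  std::printf("winner order=%d why=%d\n", Best.Order, Best.Why);
  return 0;
}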
+void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
+                                          SISchedCandidate &TryCand) {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return;
+  }
+
+  // Schedule low latency instructions as early as possible.
+  // The order of priority is:
+  // . Low latency instructions which do not depend on low latencies we
+  //   haven't waited for
+  // . Other instructions which do not depend on low latencies we haven't
+  //   waited for
+  // . Low latencies
+  // . All other instructions
+  // The goal is to get: low latency instructions - independent instructions
+  // - (eventually some more low latency instructions) - instructions that
+  // depend on the first low latency instructions.
+  // If ever a block contains a lot of constant loads, the SGPR usage can go
+  // quite high; above the arbitrary limit of 60 we encourage using the
+  // already loaded constants (in order to release some SGPRs) before
+  // loading more.
+  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
+              Cand.HasLowLatencyNonWaitedParent, TryCand, Cand, Depth))
+    return;
+
+  if (Cand.SGPRUsage > 60 &&
+      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+                 TryCand, Cand, Depth))
+    return;
+
+  if (TryCand.IsLowLatency &&
+      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+              TryCand, Cand, Depth))
+    return;
+
+  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  // Fall through to original instruction order.
+  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+    TryCand.Reason = NodeOrder;
+  }
+}
+
+SUnit* SIScheduleBlock::pickNode() {
+  SISchedCandidate TopCand;
+
+  for (std::vector<SUnit*>::iterator I = TopReadySUs.begin(),
+       E = TopReadySUs.end(); I != E; ++I) {
+    SISchedCandidate TryCand;
+    std::vector<unsigned> pressure = TopRPTracker.getRegSetPressureAtPos();
+    std::vector<unsigned> MaxPressure = TopRPTracker.getRegSetPressureAtPos();
+    // Predict register usage after this instruction.
+    TopRPTracker.getDownwardPressure((*I)->getInstr(), pressure, MaxPressure);
+    TryCand.SU = *I;
+    TryCand.SGPRUsage = pressure[DAG->SGPRSetID];
+    TryCand.VGPRUsage = pressure[DAG->VGPRSetID];
+    TryCand.IsLowLatency = DAG->IsLowLatencySU[(*I)->NodeNum];
+    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[(*I)->NodeNum];
+    TryCand.HasLowLatencyNonWaitedParent =
+      HasLowLatencyNonWaitedParent[NodeNum2Index[(*I)->NodeNum]];
+    tryCandidateTopDown(TopCand, TryCand);
+    if (TryCand.Reason != NoCand) {
+      TopCand.setBest(TryCand);
+      // DEBUG(traceCandidate(TopCand));
+    }
+  }
+
+  return TopCand.SU;
+}
+
+// Schedule something valid, with no particular quality requirement.
+void SIScheduleBlock::fastSchedule() {
+  TopReadySUs.clear();
+  if (Scheduled)
+    undoSchedule();
+
+  for (std::vector<SUnit*>::iterator I = SUnits.begin(),
+       E = SUnits.end(); I != E; ++I) {
+    SUnit *SU = *I;
+
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = TopReadySUs[0];
+    ScheduledSUnits.push_back(SU);
+    NodeScheduled(SU);
+  }
+
+  Scheduled = true;
+}
+
+// Returns true if Reg is defined between First and Last.
+static bool findDefBetween(unsigned Reg,
+                           SlotIndex First, SlotIndex Last,
+                           const MachineRegisterInfo *MRI,
+                           const LiveIntervals *LIS) {
+  for (MachineRegisterInfo::def_instr_iterator
+       UI = MRI->def_instr_begin(Reg),
+       UE = MRI->def_instr_end(); UI != UE; ++UI) {
+    const MachineInstr* MI = &*UI;
+    if (MI->isDebugValue())
+      continue;
+    SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+    if (InstSlot >= First && InstSlot <= Last)
+      return true;
+  }
+  return false;
+}
+
+void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
+                                      MachineBasicBlock::iterator EndBlock) {
+  IntervalPressure Pressure, BotPressure;
+  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
+  LiveIntervals *LIS = DAG->getLIS();
+  MachineRegisterInfo *MRI = DAG->getMRI();
+  DAG->initRPTracker(TopRPTracker);
+  DAG->initRPTracker(BotRPTracker);
+  DAG->initRPTracker(RPTracker);
+
+  // Go through all the SUnits. The RPTracker captures what had to be alive
+  // for the SUnits to execute, and what is still alive at the end.
+  for (std::vector<SUnit*>::iterator I = ScheduledSUnits.begin(),
+       E = ScheduledSUnits.end(); I != E; ++I) {
+    RPTracker.setPos((*I)->getInstr());
+    RPTracker.advance();
+  }
+
+  // Close the RPTracker to finalize live ins/outs.
+  RPTracker.closeRegion();
+
+  // Initialize the live ins and live outs.
+  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+  // Do not track physical registers, because doing so messes up the
+  // tracking.
+  for (SmallVector<unsigned, 8>::const_iterator I =
+       RPTracker.getPressure().LiveInRegs.begin(),
+       E = RPTracker.getPressure().LiveInRegs.end(); I != E; ++I) {
+    unsigned Reg = *I;
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      LiveInRegs.insert(Reg);
+  }
+  LiveOutRegs.clear();
+  // There are several cases to distinguish:
+  // 1) Reg is not input to any instruction in the block, but is output of one
+  // 2) 1) + read in the block and not needed after it
+  // 3) 1) + read in the block but needed in another block
+  // 4) Reg is input of an instruction, but another block will read it too
+  // 5) Reg is input of an instruction and then rewritten in the block.
+  //    The result is not read in the block (which implies it is used in
+  //    another block)
+  // 6) Reg is input of an instruction and then rewritten in the block.
+  //    The result is read in the block and not needed in another block
+  // 7) Reg is input of an instruction and then rewritten in the block.
+  //    The result is read in the block but also needed in another block
+  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
+  // We want LiveOutRegs to contain only the regs whose content will be read
+  // after this block in another block, and whose content was written in the
+  // current block; that is, we want cases 1, 3, 5, 7.
+  // Since we made the MIs of a block be packed all together before
+  // scheduling, the LiveIntervals were correct, and the RPTracker was able
+  // to correctly handle 5 vs 6 and 2 vs 3.
+  // (Note: this is not sufficient for the RPTracker to avoid mistakes for
+  // case 4.)
+  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5,
+  // and 7. Comparing against LiveInRegs is not sufficient to differentiate
+  // 4 from 5 and 7; the use of findDefBetween removes case 4.
+  for (SmallVector<unsigned, 8>::const_iterator I =
+       RPTracker.getPressure().LiveOutRegs.begin(),
+       E = RPTracker.getPressure().LiveOutRegs.end(); I != E; ++I) {
+    unsigned Reg = *I;
+    if (findDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(),
+                       LIS->getInstructionIndex(EndBlock).getRegSlot(),
+                       MRI, LIS) &&
+        TargetRegisterInfo::isVirtualRegister(Reg)) {
+      LiveOutRegs.insert(Reg);
+    }
+  }
+
+  // Pressure = sum of the sizes of the alive registers.
+  // Internally LLVM will represent some registers as one big 128-bit
+  // register, for example, even though it actually corresponds to 4 actual
+  // 32-bit registers. Thus Pressure is not equal to
+  // num_alive_registers * constant.
+  LiveInPressure = TopPressure.MaxSetPressure;
+  LiveOutPressure = BotPressure.MaxSetPressure;
+
+  // Prepare TopRPTracker for top-down scheduling.
+  TopRPTracker.closeTop();
+}
+
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+                               MachineBasicBlock::iterator EndBlock) {
+  if (!Scheduled)
+    fastSchedule();
+
+  // PreScheduling phase to set LiveIn and LiveOut.
+  initRegPressure(BeginBlock, EndBlock);
+  undoSchedule();
+
+  // Schedule for real now.
+
+  TopReadySUs.clear();
+
+  for (std::vector<SUnit*>::iterator I = SUnits.begin(),
+       E = SUnits.end(); I != E; ++I) {
+    SUnit *SU = *I;
+
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = pickNode();
+    ScheduledSUnits.push_back(SU);
+    TopRPTracker.setPos(SU->getInstr());
+    TopRPTracker.advance();
+    NodeScheduled(SU);
+  }
+
+  // TODO: compute InternalAdditionnalPressure.
+  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+  // Check that everything is right.
+#ifndef NDEBUG
+  assert(SUnits.size() == ScheduledSUnits.size());
+  assert(TopReadySUs.empty());
+  for (std::vector<SUnit*>::iterator I = SUnits.begin(),
+       E = SUnits.end(); I != E; ++I) {
+    SUnit *SU = *I;
+    assert(SU->isScheduled);
+    assert(SU->NumPredsLeft == 0);
+  }
+#endif
+
+  Scheduled = true;
+}
+
+void SIScheduleBlock::undoSchedule() {
+  for (std::vector<SUnit*>::iterator I = SUnits.begin(),
+       E = SUnits.end(); I != E; ++I) {
+    SUnit *SU = *I;
+    SU->isScheduled = false;
+    for (SUnit::succ_iterator I2 = SU->Succs.begin(),
+         E = SU->Succs.end(); I2 != E; ++I2) {
+      undoReleaseSucc(SU, &*I2, true);
+    }
+  }
+  ScheduledSUnits.clear();
+  Scheduled = false;
+}
+
+void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge,
+                                      bool InOrOutBlock) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
+    return;
+
+  if (SuccEdge->isWeak()) {
+    ++SuccSU->WeakPredsLeft;
+    return;
+  }
+  ++SuccSU->NumPredsLeft;
+}
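fastSchedule() together with releaseSuccessors()/NodeScheduled() below is a classic ready-list (Kahn-style) scheduler: every node keeps a count of unscheduled predecessors, scheduling a node decrements its successors' counts, and any count that reaches zero enters the ready list. A standalone sketch of just that bookkeeping (toy DAG, plain ints instead of SUnits):

#include <cstdio>
#include <vector>

int main() {
  // Edges (pred -> succ): 0 -> 2, 1 -> 2, 2 -> 3.
  std::vector<std::vector<int>> Succs = {{2}, {2}, {3}, {}};
  std::vector<int> NumPredsLeft = {0, 0, 2, 1};
  std::vector<int> Ready, Order;
  for (int N = 0; N < 4; ++N)
    if (NumPredsLeft[N] == 0)
      Ready.push_back(N); // roots are ready immediately
  while (!Ready.empty()) {
    int N = Ready.back();
    Ready.pop_back();
    Order.push_back(N);
    for (int S : Succs[N])          // "release" the successors of N
      if (--NumPredsLeft[S] == 0)
        Ready.push_back(S);
  }
  for (int N : Order)
    std::printf("%d ", N);          // e.g. 1 0 2 3
  std::printf("\n");
  return 0;
}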
***\n"; + SuccSU->dump(DAG); + dbgs() << " has been released too many times!\n"; + llvm_unreachable(nullptr); + } +#endif + + --SuccSU->NumPredsLeft; + if (SuccSU->NumPredsLeft == 0 && InOrOutBlock && !SuccSU->isScheduled) + TopReadySUs.push_back(SuccSU); +} + +/// Release Successors of the SU that are in the block or not +void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { + for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); + I != E; ++I) { + releaseSucc(SU, &*I, InOrOutBlock); + } +} + +void SIScheduleBlock::NodeScheduled(SUnit *SU) { + // Is in TopReadySUs + assert (!SU->NumPredsLeft); + std::vector::iterator I = + std::find(TopReadySUs.begin(), TopReadySUs.end(), SU); + if (I == TopReadySUs.end()) { + dbgs() << "Data Structure Bug in SI Scheduler\n"; + llvm_unreachable(nullptr); + } + TopReadySUs.erase(I); + + releaseSuccessors(SU, true); + // scheduling this node will trigger a wait, + // thus propagate to other instructions that they do not need to wait either + if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) { + for (unsigned i = 0; i < HasLowLatencyNonWaitedParent.size(); ++i) + HasLowLatencyNonWaitedParent[i] = 0; + } + if (DAG->IsLowLatencySU[SU->NodeNum]) { + for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); + I != E; ++I) { + std::map::iterator I2 = NodeNum2Index.find(SU->NodeNum); + if (I2 != NodeNum2Index.end()) + HasLowLatencyNonWaitedParent[I2->second] = 1; + } + } + SU->isScheduled = true; +} + +void SIScheduleBlock::finalizeUnits() { + // We remove links from outside blocks to enable scheduling inside the block + for (std::vector::iterator I = SUnits.begin(), + E = SUnits.end(); I != E; ++I) { + releaseSuccessors(*I, false); + if (DAG->IsHighLatencySU[(*I)->NodeNum]) + HighLatencyBlock = true; + } + HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); +} + +// we maintain ascending order of IDs +void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { + unsigned PredID = Pred->ID; + + // check if not already predecessor + for (std::vector::iterator I = Preds.begin(), + E = Preds.end(); I != E; ++I) { + if (PredID == (*I)->ID) + return; + } + Preds.push_back(Pred); + DEBUG( + for (std::vector::iterator I = Succs.begin(), + E = Succs.end(); I != E; ++I) { + if (PredID == (*I)->ID) + assert(!"Loop in the Block Graph!\n"); + } + ); +} + +void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { + unsigned SuccID = Succ->ID; + + // check if not already predecessor + for (std::vector::iterator I = Succs.begin(), + E = Succs.end(); I != E; ++I) { + if (SuccID == (*I)->ID) + return; + } + if (Succ->isHighLatencyBlock()) + ++NumHighLatencySuccessors; + Succs.push_back(Succ); + DEBUG( + for (std::vector::iterator I = Preds.begin(), + E = Preds.end(); I != E; ++I) { + if (SuccID == (*I)->ID) + assert("Loop in the Block Graph!\n"); + } + ); +} + +#ifndef NDEBUG +void SIScheduleBlock::printDebug(bool full) { + dbgs() << "Block (" << ID << ")\n"; + if (!full) + return; + + dbgs() << "\nContains High Latency Instruction: " + << HighLatencyBlock << "\n"; + dbgs() << "\nDepends On:\n"; + for (std::vector::iterator I = Preds.begin(), + E = Preds.end(); I != E; ++I) { + (*I)->printDebug(false); + } + + dbgs() << "\nSuccessors:\n"; + for (std::vector::iterator I = Succs.begin(), + E = Succs.end(); I != E; ++I) { + (*I)->printDebug(false); + } + + if (Scheduled) { + dbgs() << "LiveInPressure " << LiveInPressure[DAG->SGPRSetID] << " " + << LiveInPressure[DAG->VGPRSetID] << "\n"; + dbgs() << "LiveOutPressure " << 
LiveOutPressure[DAG->SGPRSetID] << " " + << LiveOutPressure[DAG->VGPRSetID] << "\n\n"; + dbgs() << "LiveIns:\n"; + for (std::set::iterator RegI = LiveInRegs.begin(), + E = LiveInRegs.end(); RegI != E; ++RegI) { + unsigned Reg = *RegI; + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << " "; + } + dbgs() << "\nLiveOuts:\n"; + for (std::set::iterator RegO = LiveOutRegs.begin(), + E = LiveOutRegs.end(); RegO != E; ++RegO) { + unsigned Reg = *RegO; + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << " "; + } + } + + dbgs() << "\nInstructions:\n"; + if (!Scheduled) { + for (std::vector::iterator I = SUnits.begin(), + E = SUnits.end(); I != E; ++I) { + (*I)->dump(DAG); + } + } else { + for (std::vector::iterator I = ScheduledSUnits.begin(), + E = ScheduledSUnits.end(); I != E; ++I) { + (*I)->dump(DAG); + } + } + + dbgs() << "///////////////////////\n"; +} + +#endif + +// SIScheduleBlockCreator // + +SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) : +DAG(DAG) { + topologicalSort(); +} + +SIScheduleBlockCreator::~SIScheduleBlockCreator() { +} + +std::vector +SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { + std::map>::iterator B = Blocks.find(BlockVariant); + if (B == Blocks.end()) { + createBlocksForVariant(BlockVariant); + scheduleInsideBlocks(); + Blocks[BlockVariant] = CurrentBlocks; + return CurrentBlocks; + } else { + return B->second; + } +} + +bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { + if (SU->NodeNum >= DAG->SUnits.size()) + return false; + return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->ID == ID; +} + +// Code adapted from scheduleDAG.cpp +// Does a topological sort over the SUs. +// Both TopDown and BottomUp +void SIScheduleBlockCreator::topologicalSort() { + unsigned DAGSize = DAG->SUnits.size(); + std::vector WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + WorkList.reserve(DAGSize); + + TopDownIndex2Node.resize(DAGSize); + TopDownNode2Index.resize(DAGSize); + BottomUpIndex2Node.resize(DAGSize); + BottomUpNode2Index.resize(DAGSize); + + if (DAG->getExitSU()) + WorkList.push_back(DAG->getExitSU()); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + int NodeNum = SU->NodeNum; + unsigned Degree = SU->Succs.size(); + TopDownNode2Index[NodeNum] = Degree; + if (Degree == 0) { + assert(SU->Succs.empty() && "SUnit should have no successors"); + WorkList.push_back(SU); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + if (SU->NodeNum < DAGSize) { + TopDownNode2Index[SU->NodeNum] = --Id; + TopDownIndex2Node[Id] = SU->NodeNum; + } + for (SUnit::const_pred_iterator I = SU->Preds.begin(), + E = SU->Preds.end(); I != E; ++I) { + SUnit *SU = I->getSUnit(); + if (SU->NodeNum < DAGSize && !--TopDownNode2Index[SU->NodeNum]) + WorkList.push_back(SU); + } + } + + WorkList.clear(); + + if (DAG->getEntrySU()) + WorkList.push_back(DAG->getEntrySU()); + for (int i = DAGSize-1; i >= 0; --i) { + SUnit *SU = &DAG->SUnits[i]; + int NodeNum = SU->NodeNum; + unsigned Degree = SU->Preds.size(); + BottomUpNode2Index[NodeNum] = Degree; + if (Degree == 0) { + assert(SU->Preds.empty() && "SUnit should have no successors"); + WorkList.push_back(SU); + } + } + + Id = DAGSize; + while (!WorkList.empty()) { + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + if (SU->NodeNum < DAGSize) { + BottomUpNode2Index[SU->NodeNum] = --Id; + BottomUpIndex2Node[Id] = SU->NodeNum; + } + for (SUnit::const_succ_iterator I = SU->Succs.begin(), + E = 
SU->Succs.end(); I != E; ++I) { + SUnit *SU = I->getSUnit(); + if (SU->NodeNum < DAGSize && !--BottomUpNode2Index[SU->NodeNum]) + WorkList.push_back(SU); + } + } +#ifndef NDEBUG + // Check correctness of the ordering + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + for (SUnit::const_pred_iterator I = SU->Preds.begin(), + E = SU->Preds.end(); I != E; ++I) { + if (I->getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownNode2Index[SU->NodeNum] > + TopDownNode2Index[I->getSUnit()->NodeNum] && + "Wrong Top Down topological sorting"); + } + } + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + for (SUnit::const_succ_iterator I = SU->Succs.begin(), + E = SU->Succs.end(); I != E; ++I) { + if (I->getSUnit()->NodeNum >= DAGSize) + continue; + assert(BottomUpNode2Index[SU->NodeNum] > + BottomUpNode2Index[I->getSUnit()->NodeNum] && + "Wrong Bottom Up topological sorting"); + } + } +#endif +} + +void SIScheduleBlockCreator::colorHighLatenciesAlone() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) { + CurrentColoring[SU->NodeNum] = NextReservedID++; + } + } +} + +void SIScheduleBlockCreator::colorHighLatenciesGroups() { + llvm_unreachable("Not yet Implemented"); +} + +void SIScheduleBlockCreator::colorComputeReservedDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::map, unsigned> ColorCombinations; + + CurrentTopDownReservedDependencyColoring.clear(); + CurrentBottomUpReservedDependencyColoring.clear(); + + CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); + CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); + + // Traverse TopDown, and give different colors to SUs depending + // on which combination of High Latencies they depend on. + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[TopDownIndex2Node[i]]; + std::set SUColors; + + // Already given + if (CurrentColoring[SU->NodeNum]) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = CurrentColoring[SU->NodeNum]; + continue; + } + + for (SUnit::const_pred_iterator I = SU->Preds.begin(), + E = SU->Preds.end(); I != E; ++I) { + SUnit *Pred = I->getSUnit(); + if (I->isWeak() || Pred->NodeNum >= DAGSize) + continue; + if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0) + SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]); + } + // color 0 by default + if (SUColors.size() == 0) + continue; + // same color than parents + if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = *SUColors.begin(); + else { + std::map, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second; + } else { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } + } + + ColorCombinations.clear(); + + // Same than before, but BottomUp. 
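The next pass keys every SU by the set of reserved (high latency) results it transitively depends on; equal keys become one color, and colors later become blocks. A standalone toy version of the top-down half only (the real pass also runs a bottom-up traversal and treats reserved groups specially):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

int main() {
  // Nodes 0 and 1 are high latency; 2 depends on 0, 3 on 1, 4 on both.
  std::vector<std::vector<int>> Preds = {{}, {}, {0}, {1}, {2, 3}};
  std::set<int> Reserved = {0, 1};
  std::vector<std::set<int>> Key(5); // set of reserved ancestors per node
  std::map<std::set<int>, int> Color;
  int NextColor = 0;
  for (int N = 0; N < 5; ++N) {      // 0..4 is already a topological order
    if (Reserved.count(N))
      Key[N].insert(N);              // a reserved node keys itself
    for (int P : Preds[N]) {
      if (Reserved.count(P))
        Key[N].insert(P);
      Key[N].insert(Key[P].begin(), Key[P].end()); // inherit ancestors
    }
    if (!Color.count(Key[N]))
      Color[Key[N]] = NextColor++;
    std::printf("node %d -> color %d\n", N, Color[Key[N]]);
  }
  // Groups formed: {0, 2}, {1, 3}, {4} - instructions depending on the
  // same high latencies end up together.
  return 0;
}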
+void SIScheduleBlockCreator::colorComputeReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::set<unsigned>, unsigned> ColorCombinations;
+
+  CurrentTopDownReservedDependencyColoring.clear();
+  CurrentBottomUpReservedDependencyColoring.clear();
+
+  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
+  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
+
+  // Traverse top-down, and give different colors to SUs depending
+  // on which combination of high latencies they depend on.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[TopDownIndex2Node[i]];
+    std::set<unsigned> SUColors;
+
+    // Color already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(),
+         E = SU->Preds.end(); I != E; ++I) {
+      SUnit *Pred = I->getSUnit();
+      if (I->isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
+        SUColors.insert(
+          CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
+    }
+    // Color 0 by default.
+    if (SUColors.size() == 0)
+      continue;
+    // Same color as the parents (when it is not a reserved color).
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+
+  ColorCombinations.clear();
+
+  // Same as before, but bottom-up.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[BottomUpIndex2Node[i]];
+    std::set<unsigned> SUColors;
+
+    // Color already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+         E = SU->Succs.end(); I != E; ++I) {
+      SUnit *Succ = I->getSUnit();
+      if (I->isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
+        SUColors.insert(
+          CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
+    }
+    // Keep color 0.
+    if (SUColors.size() == 0)
+      continue;
+    // Same color as the children (when it is not a reserved color).
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::set<unsigned>, unsigned> ColorCombinations;
+
+  // Every combination of colors given by the top-down
+  // and bottom-up reserved node dependencies.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    std::set<unsigned> SUColors;
+
+    // High latency instructions: color already given.
+    if (CurrentColoring[SU->NodeNum])
+      continue;
+
+    SUColors.insert(CurrentTopDownReservedDependencyColoring[SU->NodeNum]);
+    SUColors.insert(CurrentBottomUpReservedDependencyColoring[SU->NodeNum]);
+
+    std::map<std::set<unsigned>, unsigned>::iterator Pos =
+      ColorCombinations.find(SUColors);
+    if (Pos != ColorCombinations.end()) {
+      CurrentColoring[SU->NodeNum] = Pos->second;
+    } else {
+      CurrentColoring[SU->NodeNum] = NextNonReservedID;
+      ColorCombinations[SUColors] = NextNonReservedID++;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
+  llvm_unreachable("Not yet implemented");
+}
+
+void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[BottomUpIndex2Node[i]];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    // No predecessor: VGPR constant loading.
+    // Low latency instructions usually have a predecessor (the address).
+    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
+      continue;
+
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+         E = SU->Succs.end(); I != E; ++I) {
+      SUnit *Succ = I->getSUnit();
+      if (I->isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1)
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[BottomUpIndex2Node[i]];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+         E = SU->Succs.end(); I != E; ++I) {
+      SUnit *Succ = I->getSUnit();
+      if (I->isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1)
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[BottomUpIndex2Node[i]];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+         E = SU->Succs.end(); I != E; ++I) {
+      SUnit *Succ = I->getSUnit();
+      if (I->isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+  }
+}
+
+void SIScheduleBlockCreator::createBlocksForVariant(
+    SISchedulerBlockCreatorVariant BlockVariant) {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<unsigned, unsigned> RealID;
+
+  CurrentBlocks.clear();
+  CurrentColoring.clear();
+  CurrentColoring.resize(DAGSize, 0);
+
+  // Restore links a previous scheduling variant has overridden.
+  DAG->restoreSULinksLeft();
+
+  NextReservedID = 1;
+  NextNonReservedID = DAGSize + 1;
+
+  DEBUG(dbgs() << "Coloring the graph\n");
+
+  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlone) {
+    colorHighLatenciesAlone();
+    colorComputeReservedDependencies();
+    colorAccordingToReservedDependencies();
+    colorMergeConstantLoadsNextGroup();
+    colorMergeIfPossibleNextGroupOnlyForReserved();
+  } else
+    llvm_unreachable(nullptr);
+
+  // Put SUs of the same color into the same block.
+  Node2CurrentBlock.resize(DAGSize, -1);
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    unsigned Color = CurrentColoring[SU->NodeNum];
+    if (RealID.find(Color) == RealID.end()) {
+      int ID = CurrentBlocks.size();
+      BlockPtrs.push_back(
+        make_unique<SIScheduleBlock>(DAG, this, ID));
+      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
+      RealID[Color] = ID;
+    }
+    CurrentBlocks[RealID[Color]]->addUnit(SU);
+    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
+  }
+
+  // Build dependencies between blocks.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    int SUID = Node2CurrentBlock[i];
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+         E = SU->Succs.end(); I != E; ++I) {
+      SUnit *Succ = I->getSUnit();
+      if (I->isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addSucc(
+          CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+    }
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(),
+         E = SU->Preds.end(); I != E; ++I) {
+      SUnit *Pred = I->getSUnit();
+      if (I->isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addPred(
+          CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
+    }
+  }
+
+  // Free the roots and leaves of all blocks to enable scheduling inside
+  // them.
+  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->finalizeUnits();
+  }
+  DEBUG(dbgs() << "Blocks created:\n\n");
+  DEBUG(
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+// Two functions taken from CodeGen/MachineScheduler.cpp.
+
+/// If this iterator is a debug value, increment until reaching the End or
+/// a non-debug instruction.
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+            MachineBasicBlock::const_iterator End) {
+  for (; I != End; ++I) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+            MachineBasicBlock::const_iterator End) {
+  // Cast the return value to nonconst MachineInstr, then cast to an
+  // instr_iterator, which does not check for null, finally return a
+  // bundle_iterator.
+  return MachineBasicBlock::instr_iterator(
+    const_cast<MachineInstr*>(
+      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+void SIScheduleBlockCreator::scheduleInsideBlocks() {
+  unsigned DAGSize = CurrentBlocks.size();
+  std::vector<int> WorkList;
+  std::vector<int> TopDownIndex2Indice;
+  std::vector<int> TopDownIndice2Index;
+
+  DEBUG(dbgs() << "\nScheduling Blocks\n\n"); // TODO: print the variant name.
+
+  // We schedule a valid scheduling such that a block corresponds to a
+  // contiguous range of instructions. This guarantees that a result that
+  // is produced and consumed inside a block, but is also used in another
+  // block, is correctly counted in the live output regs of the block.
+  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->fastSchedule();
+  }
+
+  WorkList.reserve(DAGSize);
+
+  TopDownIndex2Indice.resize(DAGSize);
+  TopDownIndice2Index.resize(DAGSize);
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    unsigned Degree = Block->Succs.size();
+    TopDownIndice2Index[i] = Degree;
+    if (Degree == 0) {
+      WorkList.push_back(i);
+    }
+  }
+
+  int Id = DAGSize;
+  while (!WorkList.empty()) {
+    int i = WorkList.back();
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    WorkList.pop_back();
+    TopDownIndice2Index[i] = --Id;
+    TopDownIndex2Indice[Id] = i;
+    for (std::vector<SIScheduleBlock*>::iterator I = Block->Preds.begin(),
+         E = Block->Preds.end(); I != E; ++I) {
+      SIScheduleBlock *Pred = *I;
+      if (!--TopDownIndice2Index[Pred->ID])
+        WorkList.push_back(Pred->ID);
+    }
+  }
+
+#ifndef NDEBUG
+  // Check correctness of the ordering.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    for (std::vector<SIScheduleBlock*>::iterator I = Block->Preds.begin(),
+         E = Block->Preds.end(); I != E; ++I) {
+      SIScheduleBlock *Pred = *I;
+      assert(TopDownIndice2Index[i] > TopDownIndice2Index[Pred->ID] &&
+             "Wrong Top Down topological sorting");
+    }
+  }
+#endif
+
+  // Note: the following code, together with the part restoring the previous
+  // positions, is by far the most expensive operation of the scheduler.
+
+  // Do not update CurrentTop.
+  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
+  std::vector<MachineBasicBlock::iterator> PosOld;
+  std::vector<MachineBasicBlock::iterator> PosNew;
+  PosOld.reserve(DAG->SUnits.size());
+  PosNew.reserve(DAG->SUnits.size());
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Indice[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+    for (std::vector<SUnit*>::iterator I = SUs.begin(),
+         E = SUs.end(); I != E; ++I) {
+      MachineInstr *MI = (*I)->getInstr();
+      MachineBasicBlock::iterator Pos = MI;
+      PosOld.push_back(Pos);
+      if (&*CurrentTopFastSched == MI) {
+        PosNew.push_back(Pos);
+        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
+                                          DAG->getCurrentBottom());
+      } else {
+        // Update the instruction stream.
+        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
+
+        // Update LiveIntervals.
+        DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true);
+        PosNew.push_back(CurrentTopFastSched);
+      }
+    }
+  }
+
+  // Now we have blocks of SUs == blocks of MIs.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
+  }
+
+  DEBUG(dbgs() << "Restoring MI Pos\n");
+  // Restore the old ordering (which keeps LIS->handleMove happy).
+  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
+    MachineBasicBlock::iterator POld = PosOld[i-1];
+    MachineBasicBlock::iterator PNew = PosNew[i-1];
+    if (PNew != POld) {
+      // Update the instruction stream.
+      DAG->getBB()->splice(POld, DAG->getBB(), PNew);
+
+      // Update LiveIntervals.
+      DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true);
+    }
+  }
+
+  DEBUG(
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+// SIScheduleBlockScheduler //
LiveRegsConsumers.end()) + LiveRegsConsumers[Reg] = 1; + else + ++LiveRegsConsumers[Reg]; + } + } + } + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + if (BlockNumPredsLeft[i] == 0) { + ReadyBlocks.push_back(Block); + } + } + + if (Variant != SISchedulerBlockSchedulerVariant::BlockLatency) + llvm_unreachable(nullptr); + + while (SIScheduleBlock *Block = pickBlock()) { + BlocksScheduled.push_back(Block); + blockScheduled(Block); + } + + DEBUG( + dbgs() << "Block Order:"; + for (std::vector::iterator I = BlocksScheduled.begin(), + E = BlocksScheduled.end(); I != E; ++I) { + dbgs() << " " << (*I)->ID; + } + ); +} + +void SIScheduleBlockScheduler::tryCandidate(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return; + } + + // Try not to increase VGPR usage too much, else we may spill + if (VregCurrentUsage > 180 && + tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, + TryCand, Cand, RegUsage)) + return; + // Try to hide high latencies + if (tryLess(TryCand.LastPosHighLatParentScheduled, + Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) + return; + // schedules high latencies early so you can hide them better + if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, + TryCand, Cand, Latency)) + return; + if (tryGreater(TryCand.NumHighLatencySuccessors, + Cand.NumHighLatencySuccessors, + TryCand, Cand, Successor)) + return; + if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, + TryCand, Cand, RegUsage)) + return; +} + +SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { + SIBlockSchedCandidate Cand; + std::vector::iterator Best; + SIScheduleBlock *Block; + if (ReadyBlocks.size() == 0) + return nullptr; + + DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), VregCurrentUsage, SregCurrentUsage); + if (VregCurrentUsage > maxVregUsage) + maxVregUsage = VregCurrentUsage; + if (VregCurrentUsage > maxSregUsage) + maxSregUsage = VregCurrentUsage; + DEBUG( + dbgs() << "Picking New Blocks\n"; + dbgs() << "Available: "; + for (std::vector::iterator I = ReadyBlocks.begin(), + E = ReadyBlocks.end(); I != E; ++I) + dbgs() << (*I)->ID << " "; + dbgs() << "\nCurrent Live:\n"; + for (std::set::iterator I = LiveRegs.begin(), + E = LiveRegs.end(); I != E; ++I) + dbgs() << PrintVRegOrUnit(*I, DAG->getTRI()) << " "; + dbgs() << "\n"; + dbgs() << "Current VGPRs: " << VregCurrentUsage << "\n"; + dbgs() << "Current SGPRs: " << SregCurrentUsage << "\n"; + ); + + Cand.Block = nullptr; + for (std::vector::iterator I = ReadyBlocks.begin(), + E = ReadyBlocks.end(); I != E; ++I) { + SIBlockSchedCandidate TryCand; + TryCand.Block = *I; + TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); + TryCand.VGPRUsageDiff = + checkRegUsageImpact((*I)->getInRegs(), (*I)->getOutRegs())[DAG->VGPRSetID]; + TryCand.NumHighLatencySuccessors = TryCand.Block->NumHighLatencySuccessors; + TryCand.LastPosHighLatParentScheduled = LastPosHighLatencyParentScheduled[(*I)->ID]; + tryCandidate(Cand, TryCand); + if (TryCand.Reason != NoCand) { + Cand.setBest(TryCand); + Best = I; + DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->ID << " " + << getReasonStr(Cand.Reason) << "\n"); + } + } + + DEBUG( + dbgs() << "Picking: " << Cand.Block->ID << "\n"; + if (Cand.IsHighLatency) + dbgs() << "Is a block with high latency instruction\n"; + dbgs() << "Position of last high latency dependency: " + << Cand.LastPosHighLatParentScheduled << "\n"; + dbgs() << "VGPRUsageDiff: " << 
+SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
+  SIBlockSchedCandidate Cand;
+  std::vector<SIScheduleBlock*>::iterator Best;
+  SIScheduleBlock *Block;
+  if (ReadyBlocks.size() == 0)
+    return nullptr;
+
+  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
+                        VregCurrentUsage, SregCurrentUsage);
+  if (VregCurrentUsage > maxVregUsage)
+    maxVregUsage = VregCurrentUsage;
+  if (SregCurrentUsage > maxSregUsage)
+    maxSregUsage = SregCurrentUsage;
+  DEBUG(
+    dbgs() << "Picking New Blocks\n";
+    dbgs() << "Available: ";
+    for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+         E = ReadyBlocks.end(); I != E; ++I)
+      dbgs() << (*I)->ID << " ";
+    dbgs() << "\nCurrent Live:\n";
+    for (std::set<unsigned>::iterator I = LiveRegs.begin(),
+         E = LiveRegs.end(); I != E; ++I)
+      dbgs() << PrintVRegOrUnit(*I, DAG->getTRI()) << " ";
+    dbgs() << "\n";
+    dbgs() << "Current VGPRs: " << VregCurrentUsage << "\n";
+    dbgs() << "Current SGPRs: " << SregCurrentUsage << "\n";
+  );
+
+  Cand.Block = nullptr;
+  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+       E = ReadyBlocks.end(); I != E; ++I) {
+    SIBlockSchedCandidate TryCand;
+    TryCand.Block = *I;
+    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
+    TryCand.VGPRUsageDiff =
+      checkRegUsageImpact((*I)->getInRegs(),
+                          (*I)->getOutRegs())[DAG->VGPRSetID];
+    TryCand.NumHighLatencySuccessors =
+      TryCand.Block->NumHighLatencySuccessors;
+    TryCand.LastPosHighLatParentScheduled =
+      LastPosHighLatencyParentScheduled[(*I)->ID];
+    tryCandidate(Cand, TryCand);
+    if (TryCand.Reason != NoCand) {
+      Cand.setBest(TryCand);
+      Best = I;
+      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->ID << " "
+                   << getReasonStr(Cand.Reason) << "\n");
+    }
+  }
+
+  DEBUG(
+    dbgs() << "Picking: " << Cand.Block->ID << "\n";
+    if (Cand.IsHighLatency)
+      dbgs() << "Is a block with high latency instruction\n";
+    dbgs() << "Position of last high latency dependency: "
+           << Cand.LastPosHighLatParentScheduled << "\n";
+    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << "\n";
+    dbgs() << "\n";
+  );
+
+  Block = Cand.Block;
+  ReadyBlocks.erase(Best);
+  return Block;
+}
+
+// Tracking of currently alive registers to determine VGPR usage.
+
+void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
+  for (std::set<unsigned>::iterator RegI = Regs.begin(),
+       E = Regs.end(); RegI != E; ++RegI) {
+    unsigned Reg = *RegI;
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    // If not already in the live set, then add it.
+    (void) LiveRegs.insert(Reg);
+  }
+}
+
+void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
+                                                std::set<unsigned> &Regs) {
+  for (std::set<unsigned>::iterator RegI = Regs.begin(),
+       E = Regs.end(); RegI != E; ++RegI) {
+    unsigned Reg = *RegI;
+    // For now only track virtual registers.
+    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
+    assert (Pos != LiveRegs.end()); // Reg must be live.
+    assert (LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end());
+    assert (LiveRegsConsumers[Reg] >= 1);
+    --LiveRegsConsumers[Reg];
+    if (LiveRegsConsumers[Reg] == 0)
+      LiveRegs.erase(Pos);
+  }
+}
+
+void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
+  for (std::vector<SIScheduleBlock*>::iterator I = Parent->Succs.begin(),
+       E = Parent->Succs.end(); I != E; ++I) {
+    SIScheduleBlock *Block = *I;
+    --BlockNumPredsLeft[Block->ID];
+    if (BlockNumPredsLeft[Block->ID] == 0) {
+      ReadyBlocks.push_back(Block);
+    }
+    if (Parent->isHighLatencyBlock())
+      LastPosHighLatencyParentScheduled[Block->ID] = NumBlockScheduled;
+  }
+}
+
+void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
+  decreaseLiveRegs(Block, Block->getInRegs());
+  addLiveRegs(Block->getOutRegs());
+  releaseBlockSuccs(Block);
+  for (std::map<unsigned, unsigned>::iterator RegI =
+       LiveOutRegsNumUsages[Block->ID].begin(),
+       E = LiveOutRegsNumUsages[Block->ID].end(); RegI != E; ++RegI) {
+    std::pair<unsigned, unsigned> RegP = *RegI;
+    if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
+      LiveRegsConsumers[RegP.first] = RegP.second;
+    else
+      LiveRegsConsumers[RegP.first] += RegP.second;
+  }
+  ++NumBlockScheduled;
+}
+
+std::vector<int>
+SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
+                                              std::set<unsigned> &OutRegs) {
+  std::vector<int> DiffSetPressure;
+  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
+
+  for (std::set<unsigned>::iterator RegI = InRegs.begin(),
+       E = InRegs.end(); RegI != E; ++RegI) {
+    unsigned Reg = *RegI;
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    if (LiveRegsConsumers[Reg] > 1)
+      continue;
+    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+    for (; PSetI.isValid(); ++PSetI) {
+      DiffSetPressure[*PSetI] -= PSetI.getWeight();
+    }
+  }
+
+  for (std::set<unsigned>::iterator RegI = OutRegs.begin(),
+       E = OutRegs.end(); RegI != E; ++RegI) {
+    unsigned Reg = *RegI;
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+    for (; PSetI.isValid(); ++PSetI) {
+      DiffSetPressure[*PSetI] += PSetI.getWeight();
+    }
+  }
+
+  return DiffSetPressure;
+}
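Note: checkRegUsageImpact returns one signed delta per register pressure
set: inputs that the block consumes for the last time release their
weight, outputs add theirs, and pickBlock then reads the entry for the
VGPR set. A self-contained toy model of that computation (RegInfo and
the one-set-per-register simplification are illustrative only):

  #include <map>
  #include <set>
  #include <vector>

  struct RegInfo { unsigned PSet; int Weight; };

  // Per-pressure-set delta of scheduling a block: dying inputs subtract
  // their weight, newly defined outputs add theirs.
  std::vector<int> pressureDelta(const std::set<unsigned> &InRegs,
                                 const std::set<unsigned> &OutRegs,
                                 const std::map<unsigned, RegInfo> &Info,
                                 std::map<unsigned, unsigned> &Consumers,
                                 unsigned NumSets) {
    std::vector<int> Diff(NumSets, 0);
    for (unsigned Reg : InRegs)
      if (Consumers[Reg] <= 1) // Last consumer: the register dies here.
        Diff[Info.at(Reg).PSet] -= Info.at(Reg).Weight;
    for (unsigned Reg : OutRegs)
      Diff[Info.at(Reg).PSet] += Info.at(Reg).Weight;
    return Diff;
  }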
+// SIScheduler //
+
+std::vector<unsigned>
+SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
+  std::vector<SIScheduleBlock*> Blocks = BlockCreator.getBlocks(BlockVariant);
+  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
+  std::vector<unsigned> Res;
+
+  Blocks = Scheduler.getBlocks();
+
+  for (unsigned b = 0; b < Blocks.size(); ++b) {
+    SIScheduleBlock *Block = Blocks[b];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+    for (std::vector<SUnit*>::iterator I = SUs.begin(),
+         E = SUs.end(); I != E; ++I) {
+      Res.push_back((*I)->NodeNum);
+    }
+  }
+
+  return Res;
+}
+
+// SIScheduleDAGMI //
+
+SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
+  ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
+  SITII = static_cast<const SIInstrInfo*>(TII);
+  SITRI = static_cast<const SIRegisterInfo*>(TRI);
+
+  VGPRSetID = SITRI->getVGPR32PressureSet();
+  SGPRSetID = SITRI->getSGPR32PressureSet();
+}
+
+SIScheduleDAGMI::~SIScheduleDAGMI() {
+}
+
+// Move low latencies further from their user without
+// increasing SGPR usage (in general).
+// This is to be replaced by a better pass that would
+// take into account SGPR usage (based on VGPR usage
+// and the corresponding wavefront count), that would
+// try to merge groups of loads when it makes sense, etc.
+void SIScheduleDAGMI::moveLowLatencies() {
+  unsigned DAGSize = SUnits.size();
+  int LastLowLatencyUser = -1;
+  int LastLowLatencyPos = -1;
+
+  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[ScheduledSUnits[i]];
+    bool IsLowLatencyUser = false;
+    unsigned MinPos = 0;
+
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(),
+         E = SU->Preds.end(); I != E; ++I) {
+      SUnit *Pred = I->getSUnit();
+      if (SITII->isLowLatencyInstruction(Pred->getInstr())) {
+        IsLowLatencyUser = true;
+      }
+      if (Pred->NodeNum >= DAGSize)
+        continue;
+      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
+      if (PredPos >= MinPos)
+        MinPos = PredPos + 1;
+    }
+
+    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+      unsigned BestPos = LastLowLatencyUser + 1;
+      if ((int)BestPos <= LastLowLatencyPos)
+        BestPos = LastLowLatencyPos + 1;
+      if (BestPos < MinPos)
+        BestPos = MinPos;
+      if (BestPos < i) {
+        for (unsigned u = i; u > BestPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[BestPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
+      }
+      LastLowLatencyPos = BestPos;
+      if (IsLowLatencyUser)
+        LastLowLatencyUser = BestPos;
+    } else if (IsLowLatencyUser) {
+      LastLowLatencyUser = i;
+    // Also move the COPY instructions on which the
+    // low latency instructions depend.
+    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
+      bool CopyForLowLat = false;
+      for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+           E = SU->Succs.end(); I != E; ++I) {
+        SUnit *Succ = I->getSUnit();
+        if (SITII->isLowLatencyInstruction(Succ->getInstr())) {
+          CopyForLowLat = true;
+        }
+      }
+      if (!CopyForLowLat)
+        continue;
+      if (MinPos < i) {
+        for (unsigned u = i; u > MinPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[MinPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
+      }
+    }
+  }
+}
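Note: the shifting loops in moveLowLatencies implement a stable rotation:
the unit at position i moves down to BestPos (or MinPos), everything in
between slides up one slot, and the inverse permutation
ScheduledSUnitsInv is patched along the way so both arrays stay
consistent. A self-contained model of that rotation (names illustrative):

  #include <vector>

  // Move Order[From] down to position To (To < From), shifting the
  // range [To, From) up by one and keeping the inverse map in sync.
  void rotateDown(std::vector<unsigned> &Order, std::vector<unsigned> &Inv,
                  unsigned From, unsigned To) {
    unsigned Moved = Order[From];
    for (unsigned u = From; u > To; --u) {
      Inv[Order[u - 1]] += 1; // Each shifted unit lands one slot later.
      Order[u] = Order[u - 1];
    }
    Order[To] = Moved;
    Inv[Moved] = To;
  }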
+void SIScheduleDAGMI::restoreSULinksLeft() {
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
+    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
+    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
+    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
+  }
+}
+
+// Return the vgpr and sgpr usage corresponding to some virtual registers.
+template<typename _Iterator> void
+SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
+                                  unsigned &VgprUsage, unsigned &SgprUsage) {
+  VgprUsage = 0;
+  SgprUsage = 0;
+  for (_Iterator RegI = First; RegI != End; ++RegI) {
+    unsigned Reg = *RegI;
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    PSetIterator PSetI = MRI.getPressureSets(Reg);
+    for (; PSetI.isValid(); ++PSetI) {
+      if (*PSetI == VGPRSetID)
+        VgprUsage += PSetI.getWeight();
+      else if (*PSetI == SGPRSetID)
+        SgprUsage += PSetI.getWeight();
+    }
+  }
+}
+
+void SIScheduleDAGMI::schedule() {
+  SmallVector<SUnit*, 8> TopRoots, BotRoots;
+  DEBUG(dbgs() << "Preparing Scheduling\n");
+
+  buildDAGWithRegPressure();
+  DEBUG(for (std::vector<SUnit>::iterator I = SUnits.begin(),
+             E = SUnits.end(); I != E; ++I)
+          (*I).dumpAll(this));
+
+  findRootsAndBiasEdges(TopRoots, BotRoots);
+  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
+  // functions, but to make them happy we must initialize
+  // the default Scheduler implementation (even if we do not
+  // run it).
+  SchedImpl->initialize(this);
+  initQueues(TopRoots, BotRoots);
+
+  // Fill some stats to help scheduling.
+
+  SUnitsLinksBackup = SUnits;
+  IsLowLatencySU.clear();
+  LowLatencyOffset.clear();
+  IsHighLatencySU.clear();
+
+  IsLowLatencySU.resize(SUnits.size(), 0);
+  LowLatencyOffset.resize(SUnits.size(), 0);
+  IsHighLatencySU.resize(SUnits.size(), 0);
+
+  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    unsigned BaseLatReg, OffLatReg;
+    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+      IsLowLatencySU[i] = 1;
+      if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg,
+                                       OffLatReg, TRI))
+        LowLatencyOffset[i] = OffLatReg;
+    } else if (SITII->isHighLatencyInstruction(SU->getInstr()))
+      IsHighLatencySU[i] = 1;
+  }
+
+  SIScheduler Scheduler(this);
+  ScheduledSUnits =
+    Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
+                              SISchedulerBlockSchedulerVariant::BlockLatency);
+  ScheduledSUnitsInv.resize(SUnits.size());
+
+  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
+  }
+
+  moveLowLatencies();
+
+  // Tell the outside world about the result of the scheduling.
+
+  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+  TopRPTracker.setPos(CurrentTop);
+
+  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
+       E = ScheduledSUnits.end(); I != E; ++I) {
+    SUnit *SU = &SUnits[*I];
+
+    scheduleMI(SU, true);
+
+    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+                 << *SU->getInstr());
+  }
+
+  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+
+  DEBUG({
+    unsigned BBNum = begin()->getParent()->getNumber();
+    dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+    dumpSchedule();
+    dbgs() << '\n';
+  });
+}
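Note: assuming the scheduler is registered under the name "si" with
MachineSchedRegistry elsewhere in this patch, it should be selectable
through the generic machine-scheduler option without making it the
default. A hypothetical invocation (CPU and file names are placeholders):

  llc -march=amdgcn -mcpu=tonga -misched=si -o out.s in.ll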
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -24,6 +24,12 @@
 
 struct SIRegisterInfo : public AMDGPURegisterInfo {
 private:
+  unsigned SGPRsForWaveFronts[10];
+  unsigned SGPRsForWaveFrontsVI[10];
+  unsigned VGPRsForWaveFronts[10];
+  unsigned SGPR32SetID;
+  unsigned VGPR32SetID;
+
   void reserveRegisterTuples(BitVector &, unsigned Reg) const;
 
 public:
@@ -122,6 +128,16 @@
   unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass *RC) const;
 
+  unsigned getSGPR32PressureSet() const { return SGPR32SetID; }
+  unsigned getVGPR32PressureSet() const { return VGPR32SetID; }
+
+  unsigned getWaveFrontsForUsage(AMDGPUSubtarget::Generation gen,
+                                 unsigned SGPRsUsed,
+                                 unsigned VGPRsUsed) const;
+  unsigned spillCost(AMDGPUSubtarget::Generation gen,
+                     unsigned SGPRsUsed,
+                     unsigned VGPRsUsed) const;
+
 private:
   void buildScratchLoadStore(MachineBasicBlock::iterator MI,
                              unsigned LoadStoreOp, unsigned Value,
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -24,7 +24,19 @@
 
 using namespace llvm;
 
-SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {}
+SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {
+  for (unsigned i = 0; i < 10; ++i) {
+    SGPRsForWaveFronts[i] = getNumSGPRsAllowed(AMDGPUSubtarget::SOUTHERN_ISLANDS, i+1);
+    SGPRsForWaveFrontsVI[i] = getNumSGPRsAllowed(AMDGPUSubtarget::VOLCANIC_ISLANDS, i+1);
+    VGPRsForWaveFronts[i] = getNumVGPRsAllowed(i+1);
+  }
+  for (unsigned i = 0; i < getNumRegPressureSets(); ++i) {
+    if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0)
+      SGPR32SetID = i;
+    else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0)
+      VGPR32SetID = i;
+  }
+}
 
 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
   MCRegAliasIterator R(Reg, this, true);
@@ -537,3 +549,43 @@
     }
   }
 }
+
+unsigned SIRegisterInfo::getWaveFrontsForUsage(AMDGPUSubtarget::Generation gen,
+                                               unsigned SGPRsUsed,
+                                               unsigned VGPRsUsed) const {
+  unsigned i;
+  const unsigned int *SGPRsForWaveFrontsForGen;
+
+  if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    SGPRsForWaveFrontsForGen = &SGPRsForWaveFrontsVI[0];
+  else
+    SGPRsForWaveFrontsForGen = &SGPRsForWaveFronts[0];
+
+  for (i = 9; i > 0; --i) {
+    if (SGPRsForWaveFrontsForGen[i] >= SGPRsUsed &&
+        VGPRsForWaveFronts[i] >= VGPRsUsed)
+      break;
+  }
+
+  return i+1;
+}
+
+unsigned SIRegisterInfo::spillCost(AMDGPUSubtarget::Generation gen,
+                                   unsigned SGPRsUsed,
+                                   unsigned VGPRsUsed) const {
+  unsigned cost = 0;
+  const unsigned int *SGPRsForWaveFrontsForGen;
+
+  if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    SGPRsForWaveFrontsForGen = &SGPRsForWaveFrontsVI[0];
+  else
+    SGPRsForWaveFrontsForGen = &SGPRsForWaveFronts[0];
+
+  if (SGPRsForWaveFrontsForGen[0] < SGPRsUsed)
+    cost += SGPRsUsed - SGPRsForWaveFrontsForGen[0];
+  // Spilling VGPRs hurts more than spilling SGPRs.
+  if (VGPRsForWaveFronts[0] < VGPRsUsed)
+    cost += 4*(VGPRsUsed - VGPRsForWaveFronts[0]);
+
+  return cost;
+}
\ No newline at end of file
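Note: getWaveFrontsForUsage scans from the 10-wave budget down to the
1-wave budget and returns the highest wavefront count whose SGPR and
VGPR budgets both cover the current usage; spillCost then prices any
overshoot of the 1-wave budgets, weighting VGPR spills four times an
SGPR spill. A self-contained model of the lookup (the real budgets come
from getNumSGPRsAllowed/getNumVGPRsAllowed at construction; the arrays
here are placeholders):

  // Index i holds the register budget that still allows i+1 waves;
  // budgets shrink as the wave count grows.
  unsigned wavesForUsage(const unsigned SGPRBudget[10],
                         const unsigned VGPRBudget[10],
                         unsigned SGPRsUsed, unsigned VGPRsUsed) {
    for (unsigned i = 9; i > 0; --i)
      if (SGPRBudget[i] >= SGPRsUsed && VGPRBudget[i] >= VGPRsUsed)
        return i + 1; // First hit from the top is the maximum wave count.
    return 1;         // At least one wave always fits.
  }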