Index: include/llvm/Analysis/DivergenceAnalysis.h
===================================================================
--- include/llvm/Analysis/DivergenceAnalysis.h
+++ include/llvm/Analysis/DivergenceAnalysis.h
@@ -13,6 +13,8 @@
 // better decisions.
 //
 //===----------------------------------------------------------------------===//
+#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
 
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/IR/Function.h"
@@ -46,3 +48,5 @@
   DenseSet<const Value *> DivergentValues;
 };
 } // End llvm namespace
+
+#endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
\ No newline at end of file
Index: include/llvm/CodeGen/FunctionLoweringInfo.h
===================================================================
--- include/llvm/CodeGen/FunctionLoweringInfo.h
+++ include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -118,6 +118,17 @@
   /// cross-basic-block values.
   DenseMap<const Value *, unsigned> ValueMap;
 
+  /// VirtReg2Value map is needed by the Divergence Analysis driven
+  /// instruction selection. It is the reverse of ValueMap and is computed
+  /// lazily, on demand. It is used to get the Value corresponding to a
+  /// live-in virtual register and is queried from
+  /// TargetLowering::isSDNodeSourceOfDivergence.
+  DenseMap<unsigned, const Value *> VirtReg2Value;
+
+  /// This method is called from TargetLowering::isSDNodeSourceOfDivergence
+  /// to get the Value corresponding to the live-in virtual register.
+  const Value *getValueFromVirtualReg(unsigned Vreg);
+
   /// Track virtual registers created for exception pointers.
   DenseMap<const Value *, unsigned> CatchPadExceptionPointers;
 
Index: include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- include/llvm/CodeGen/SelectionDAG.h
+++ include/llvm/CodeGen/SelectionDAG.h
@@ -28,8 +28,10 @@
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineValueType.h"
@@ -217,6 +219,9 @@
   LLVMContext *Context;
   CodeGenOpt::Level OptLevel;
 
+  DivergenceAnalysis *DA = nullptr;
+  FunctionLoweringInfo *FLI = nullptr;
+
   /// The function-level optimization remark emitter. Used to emit remarks
   /// whenever manipulating the DAG.
   OptimizationRemarkEmitter *ORE;
@@ -346,19 +351,7 @@
         .getRawSubclassData();
   }
 
-  void createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
-    assert(!Node->OperandList && "Node already has operands");
-    SDUse *Ops = OperandRecycler.allocate(
-        ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
-
-    for (unsigned I = 0; I != Vals.size(); ++I) {
-      Ops[I].setUser(Node);
-      Ops[I].setInitial(Vals[I]);
-    }
-    Node->NumOperands = Vals.size();
-    Node->OperandList = Ops;
-    checkForCycles(Node);
-  }
+  void createOperands(SDNode *Node, ArrayRef<SDValue> Vals);
 
   void removeOperands(SDNode *Node) {
     if (!Node->OperandList)
@@ -378,7 +371,12 @@
 
   /// Prepare this SelectionDAG to process code in the given MachineFunction.
   void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
-            Pass *PassPtr, const TargetLibraryInfo *LibraryInfo);
+            Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
+            DivergenceAnalysis *DA);
+
+  void setFunctionLoweringInfo(FunctionLoweringInfo *FuncInfo) {
+    FLI = FuncInfo;
+  }
 
   /// Clear state and free memory necessary to make this
   /// SelectionDAG ready to process a new block.
Index: include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- include/llvm/CodeGen/SelectionDAGNodes.h
+++ include/llvm/CodeGen/SelectionDAGNodes.h
@@ -466,11 +466,14 @@
     friend class SDNode;
     friend class MemIntrinsicSDNode;
     friend class MemSDNode;
+    friend class SelectionDAG;
+    friend class SelectionDAGBuilder;
 
     uint16_t HasDebugValue : 1;
     uint16_t IsMemIntrinsic : 1;
+    uint16_t IsDivergent : 1;
   };
-  enum { NumSDNodeBits = 2 };
+  enum { NumSDNodeBits = 3 };
 
   class ConstantSDNodeBitfields {
     friend class ConstantSDNode;
@@ -548,6 +551,8 @@
   // TODO: unfriend HandleSDNode once we fix its operand handling.
   friend class HandleSDNode;
 
+  friend class SelectionDAGBuilder;
+
   /// Unique id per SDNode in the DAG.
   int NodeId = -1;
 
@@ -662,6 +667,8 @@
   bool getHasDebugValue() const { return SDNodeBits.HasDebugValue; }
   void setHasDebugValue(bool b) { SDNodeBits.HasDebugValue = b; }
 
+  bool isDivergent() const { return SDNodeBits.IsDivergent; }
+
   /// Return true if there are no uses of this node.
   bool use_empty() const { return UseList == nullptr; }
 
@@ -997,7 +1004,19 @@
   /// This method should only be used by the SDUse class.
   void addUse(SDUse &U) { U.addToList(&UseList); }
-
+
+  void updateDivergence() {
+    bool IsDivergent = SDNodeBits.IsDivergent;
+    for (auto &Op : ops()) {
+      IsDivergent |= Op.getNode()->isDivergent();
+    }
+    if (SDNodeBits.IsDivergent != IsDivergent) {
+      SDNodeBits.IsDivergent = IsDivergent;
+      for (auto U : uses()) {
+        U->updateDivergence();
+      }
+    }
+  }
 protected:
   static SDVTList getSDVTList(EVT VT) {
     SDVTList Ret = { getValueTypeList(VT), 1 };
Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -29,6 +29,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineValueType.h"
@@ -2556,6 +2557,15 @@
 
   bool isPositionIndependent() const;
 
+  virtual bool isSDNodeSourceOfDivergence(const SDNode *N,
+      FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const {
+    return false;
+  }
+
+  virtual bool isSDNodeAlwaysUniform(const SDNode *N) const {
+    return false;
+  }
+
   /// Returns true by value, base pointer and offset pointer and addressing mode
   /// by reference if the node's address can be legally represented as
   /// pre-indexed load / store address.
Index: lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -547,3 +547,13 @@
   }
   return std::make_pair(It->second, false);
 }
+
+const Value *
+FunctionLoweringInfo::getValueFromVirtualReg(unsigned Vreg) {
+  if (VirtReg2Value.empty()) {
+    for (auto &P : ValueMap) {
+      VirtReg2Value[P.second] = P.first;
+    }
+  }
+  return VirtReg2Value[Vreg];
+}
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -903,7 +903,8 @@
 
 void SelectionDAG::init(MachineFunction &NewMF,
                         OptimizationRemarkEmitter &NewORE,
-                        Pass *PassPtr, const TargetLibraryInfo *LibraryInfo) {
+                        Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
+                        DivergenceAnalysis *Divergence) {
   MF = &NewMF;
   SDAGISelPass = PassPtr;
   ORE = &NewORE;
@@ -911,6 +912,7 @@
   TSI = getSubtarget().getSelectionDAGInfo();
   LibInfo = LibraryInfo;
   Context = &MF->getFunction().getContext();
+  DA = Divergence;
 }
 
 SelectionDAG::~SelectionDAG() {
@@ -7239,8 +7241,9 @@
     SDUse &Use = UI.getUse();
     ++UI;
     Use.set(To);
+    if (To->isDivergent() != From->isDivergent())
+      User->updateDivergence();
   } while (UI != UE && *UI == User);
-
   // Now that we have modified User, add it back to the CSE maps.  If it
   // already exists there, recursively merge the results together.
   AddModifiedNodeToCSEMaps(User);
@@ -7294,6 +7297,8 @@
     SDUse &Use = UI.getUse();
     ++UI;
     Use.setNode(To);
+    if (To->isDivergent() != From->isDivergent())
+      User->updateDivergence();
   } while (UI != UE && *UI == User);
 
   // Now that we have modified User, add it back to the CSE maps.  If it
@@ -7338,8 +7343,9 @@
     const SDValue &ToOp = To[Use.getResNo()];
     ++UI;
     Use.set(ToOp);
+    if (To->getNode()->isDivergent() != From->isDivergent())
+      User->updateDivergence();
   } while (UI != UE && *UI == User);
-
   // Now that we have modified User, add it back to the CSE maps.  If it
   // already exists there, recursively merge the results together.
   AddModifiedNodeToCSEMaps(User);
@@ -7397,8 +7403,9 @@
 
       ++UI;
       Use.set(To);
+      if (To->isDivergent() != From->isDivergent())
+        User->updateDivergence();
     } while (UI != UE && *UI == User);
-
     // We are iterating over all uses of the From node, so if a use
     // doesn't use the specific value, no changes are made.
     if (!UserRemovedFromCSEMaps)
@@ -8236,6 +8243,25 @@
   return nullptr;
 }
 
+void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
+  assert(!Node->OperandList && "Node already has operands");
+  SDUse *Ops = OperandRecycler.allocate(
+      ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
+
+  bool IsDivergent = false;
+  for (unsigned I = 0; I != Vals.size(); ++I) {
+    Ops[I].setUser(Node);
+    Ops[I].setInitial(Vals[I]);
+    IsDivergent = IsDivergent || Ops[I].getNode()->isDivergent();
+  }
+  Node->NumOperands = Vals.size();
+  Node->OperandList = Ops;
+  IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, DA);
+  if (!TLI->isSDNodeAlwaysUniform(Node))
+    Node->SDNodeBits.IsDivergent = IsDivergent;
+  checkForCycles(Node);
+}
+
 #ifndef NDEBUG
 static void checkForCyclesHelper(const SDNode *N,
                                  SmallPtrSetImpl<const SDNode *> &Visited,
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -628,6 +628,8 @@
 
   if (getNodeId() != -1)
     OS << " [ID=" << getNodeId() << ']';
+  if (!(isa<ConstantSDNode>(this) || isa<ConstantFPSDNode>(this)))
+    OS << "# D:" << isDivergent();
 
   if (!G)
     return;
Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -414,7 +414,8 @@
 
   SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);
 
-  CurDAG->init(*MF, *ORE, this, LibInfo);
+  CurDAG->init(*MF, *ORE, this, LibInfo,
+               getAnalysisIfAvailable<DivergenceAnalysis>());
   FuncInfo->set(Fn, *MF, CurDAG);
 
   // Now get the optional analyzes if we want to.
@@ -1401,6 +1402,8 @@
   FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()];
   FuncInfo->InsertPt = FuncInfo->MBB->begin();
 
+  CurDAG->setFunctionLoweringInfo(FuncInfo);
+
   if (!FastIS) {
     LowerArguments(Fn);
   } else {
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
@@ -83,6 +84,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AMDGPUArgumentUsageInfo>();
+    AU.addRequired<DivergenceAnalysis>();
     SelectionDAGISel::getAnalysisUsage(AU);
   }
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -168,6 +168,11 @@
   bool isCheapToSpeculateCttz() const override;
   bool isCheapToSpeculateCtlz() const override;
 
+  bool isSDNodeSourceOfDivergence(const SDNode *N,
+      FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const;
+
+  bool isSDNodeAlwaysUniform(const SDNode *N) const;
+
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
   static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -748,6 +748,123 @@
   return true;
 }
 
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
+  switch (N->getOpcode()) {
+  default:
+    return false;
+  case ISD::INTRINSIC_WO_CHAIN:
+  {
+    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntrID) {
+    default:
+      return false;
+    case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readlane:
+      return true;
+    }
+  }
+  break;
+  }
+}
+
+bool AMDGPUTargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
+    FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const
+{
+  switch (N->getOpcode()) {
+  case ISD::CopyFromReg:
+  {
+    if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(N->getOperand(1)))
+    {
+      unsigned Reg = R->getReg();
+      const AMDGPURegisterInfo *RI = Subtarget->getRegisterInfo();
+      if (RI->isPhysicalRegister(Reg)) {
+        return RI->getRegClass(AMDGPU::VGPR_32RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_64RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_96RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_128RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_256RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_512RegClassID)->contains(Reg);
+      }
+      else {
+        // Formal arguments of non-entry functions
+        // are conservatively considered divergent.
+        if (FLI->RegInfo->isLiveIn(Reg) &&
+            !AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
+          return true;
+        return DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+      }
+    }
+  }
+  break;
+  case ISD::LOAD:
+  {
+    const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
+    if (L->getMemOperand()->getAddrSpace() == Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
+      return true;
+  }
+  break;
+  case ISD::CALLSEQ_END:
+    return true;
+  break;
+  case ISD::INTRINSIC_WO_CHAIN:
+  {
+    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntrID) {
+    case Intrinsic::amdgcn_workitem_id_x:
+    case Intrinsic::amdgcn_workitem_id_y:
+    case Intrinsic::amdgcn_workitem_id_z:
+    case Intrinsic::r600_read_tidig_x:
+    case Intrinsic::r600_read_tidig_y:
+    case Intrinsic::r600_read_tidig_z:
+    case Intrinsic::amdgcn_interp_p1:
+    case Intrinsic::amdgcn_interp_p2:
+    case Intrinsic::amdgcn_interp_mov:
+      return true;
+    }
+  }
+  break;
+  case ISD::INTRINSIC_W_CHAIN:
+  {
+    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntrID) {
+    case Intrinsic::amdgcn_mbcnt_hi:
+    case Intrinsic::amdgcn_mbcnt_lo:
+    case Intrinsic::amdgcn_atomic_inc:
+    case Intrinsic::amdgcn_atomic_dec:
+    case Intrinsic::amdgcn_image_atomic_swap:
+    case Intrinsic::amdgcn_image_atomic_add:
+    case Intrinsic::amdgcn_image_atomic_sub:
+    case Intrinsic::amdgcn_image_atomic_smin:
+    case Intrinsic::amdgcn_image_atomic_umin:
+    case Intrinsic::amdgcn_image_atomic_smax:
+    case Intrinsic::amdgcn_image_atomic_umax:
+    case Intrinsic::amdgcn_image_atomic_and:
+    case Intrinsic::amdgcn_image_atomic_or:
+    case Intrinsic::amdgcn_image_atomic_xor:
+    case Intrinsic::amdgcn_image_atomic_inc:
+    case Intrinsic::amdgcn_image_atomic_dec:
+    case Intrinsic::amdgcn_image_atomic_cmpswap:
+    case Intrinsic::amdgcn_buffer_atomic_swap:
+    case Intrinsic::amdgcn_buffer_atomic_add:
+    case Intrinsic::amdgcn_buffer_atomic_sub:
+    case Intrinsic::amdgcn_buffer_atomic_smin:
+    case Intrinsic::amdgcn_buffer_atomic_umin:
+    case Intrinsic::amdgcn_buffer_atomic_smax:
+    case Intrinsic::amdgcn_buffer_atomic_umax:
+    case Intrinsic::amdgcn_buffer_atomic_and:
+    case Intrinsic::amdgcn_buffer_atomic_or:
+    case Intrinsic::amdgcn_buffer_atomic_xor:
+    case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+    case Intrinsic::amdgcn_ps_live:
+    case Intrinsic::amdgcn_ds_swizzle:
+      return true;
+    }
+  }
+  break;
+  }
+  return false;
+}
+
 //===---------------------------------------------------------------------===//
 // Target Properties
 //===---------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5379,7 +5379,7 @@
     unsigned NumElements = MemVT.getVectorNumElements();
 
     if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
-      if (isMemOpUniform(Load))
+      if (!Op->isDivergent())
        return SDValue();
      // Non-uniform loads will be selected to MUBUF instructions, so they
      // have the same legalization requirements as global and private
@@ -5387,7 +5387,7 @@
      //
    }
    if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
-      if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
+      if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
          !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
        return SDValue();
      // Non-uniform loads will be selected to MUBUF instructions, so they
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -223,11 +223,9 @@
 def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
   auto Ld = cast<LoadSDNode>(N);
   return Ld->getAlignment() >= 4 &&
-    ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
-    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+    ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && !N->isDivergent()) ||
     (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
-    !Ld->isVolatile() &&
-    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+    !Ld->isVolatile() && !N->isDivergent() &&
     static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
 }]>;
Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
===================================================================
--- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -2,7 +2,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 ; GCN-LABEL: {{^}}use_dispatch_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @use_dispatch_ptr() #1 {
   %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
@@ -19,7 +21,9 @@
 }
 
 ; GCN-LABEL: {{^}}use_queue_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @use_queue_ptr() #1 {
   %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
@@ -37,11 +41,12 @@
 }
 
 ; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
-; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10
+; CIVI: flat_load_dword v[[HI:[0-9]+]], v[0:1]
 ; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]]
-
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
+; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16
+; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
+; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
+; CIVI: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @use_queue_ptr_addrspacecast() #1 {
   %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32*
   store volatile i32 0, i32* %asc
@@ -60,7 +65,9 @@
 }
 
 ; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @use_kernarg_segment_ptr() #1 {
   %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
@@ -424,9 +431,15 @@
 ; GCN-LABEL: {{^}}use_every_sgpr_input:
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
-; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
-; GCN: s_load_dword s{{[0-9]+}}, s[10:11], 0x0
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s10
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s11
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GCN: ; use s[12:13]
 ; GCN: ; use s14
 ; GCN: ; use s15
@@ -554,15 +567,26 @@
 ; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[8:9]
 ; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[10:11]
-; GCN-DAG: s_mov_b32 s6, s14
+; GCN-DAG: s_mov_b32 s6, s14
 ; GCN-DAG: s_mov_b32 s7, s15
 ; GCN-DAG: s_mov_b32 s8, s16
+
+; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[6:7]
+; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[8:9]
+; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[10:11]
+
 ; GCN: s_swappc_b64
 
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
-; GCN: s_load_dword s{{[0-9]+}},
-; GCN: s_load_dword s{{[0-9]+}},
-; GCN: s_load_dword s{{[0-9]+}},
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_X]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_X]]
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Y]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Y]]
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Z]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Z]]
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GCN: ; use
 ; GCN: ; use [[SAVE_X]]
 ; GCN: ; use [[SAVE_Y]]
Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -34,7 +34,13 @@
 
 ; GCN-LABEL: {{^}}func_implicitarg_ptr:
 ; GCN: s_waitcnt
-; GCN-NEXT: s_load_dword s{{[0-9]+}}, s[6:7], 0x0{{$}}
+; MESA: s_mov_b64 s[8:9], s[6:7]
+; MESA: s_mov_b32 s11, 0xf000
+; MESA: s_mov_b32 s10, -1
+; MESA: buffer_load_dword v0, off, s[8:11], 0
+; HSA: v_mov_b32_e32 v0, s6
+; HSA: v_mov_b32_e32 v1, s7
+; HSA: flat_load_dword v0, v[0:1]
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @func_implicitarg_ptr() #1 {
@@ -83,8 +89,21 @@
 
 ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
 ; GCN: s_waitcnt
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0{{$}}
-; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0{{$}}
+; MESA: s_mov_b64 s[12:13], s[6:7]
+; MESA: s_mov_b32 s15, 0xf000
+; MESA: s_mov_b32 s14, -1
+; MESA: buffer_load_dword v0, off, s[12:15], 0
+; HSA: v_mov_b32_e32 v0, s6
+; HSA: v_mov_b32_e32 v1, s7
+; HSA: flat_load_dword v0, v[0:1]
+; MESA: s_mov_b32 s10, s14
+; MESA: s_mov_b32 s11, s15
+; MESA: buffer_load_dword v0, off, s[8:11], 0
+; HSA: v_mov_b32_e32 v0, s8
+; HSA: v_mov_b32_e32 v1, s9
+; HSA: flat_load_dword v0, v[0:1]
+
+; GCN: s_waitcnt vmcnt(0)
 define void @func_kernarg_implicitarg_ptr() #1 {
   %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
Index: test/CodeGen/AMDGPU/spill-m0.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-m0.ll
+++ test/CodeGen/AMDGPU/spill-m0.ll
@@ -156,47 +156,39 @@
 }
 
 ; GCN-LABEL: {{^}}restore_m0_lds:
-; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
-; TOSMEM: s_cmp_eq_u32
+
 ; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
 ; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1
 
+; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
 ; TOSMEM: s_mov_b32 m0, -1
 
-; TOSMEM: s_mov_b32 s0, m0
-; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
-; TOSMEM: s_mov_b32 m0, s0
-; TOSMEM: s_waitcnt lgkmcnt(0)
-
 ; TOSMEM: ds_write_b64
 
 ; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x300
-; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
+; TOSMEM: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_buffer_load_dword s2, s[88:91], m0 ; 4-byte Folded Reload
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
 ; TOSMEM-NOT: m0
-; TOSMEM: s_mov_b32 m0, s0
+; TOSMEM: s_mov_b32 m0, s2
 ; TOSMEM: ; use m0
 
 ; TOSMEM: s_dcache_wb
 ; TOSMEM: s_endpgm
 define amdgpu_kernel void @restore_m0_lds(i32 %arg) {
   %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
-  %sval = load volatile i64, i64 addrspace(2)* undef
+  %sval = load i64, i64 addrspace(2)* undef
   %cmp = icmp eq i32 %arg, 0
   br i1 %cmp, label %ret, label %bb
 
 bb:
-  store volatile i64 %sval, i64 addrspace(3)* undef
+  store i64 %sval, i64 addrspace(3)* undef
   call void asm sideeffect "; use $0", "{M0}"(i32 %m0) #0
   br label %ret