Index: lib/CodeGen/LiveIntervalAnalysis.cpp
===================================================================
--- lib/CodeGen/LiveIntervalAnalysis.cpp
+++ lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -948,8 +948,12 @@
         LiveInterval &LI = LIS.getInterval(Reg);
         if (LI.hasSubRanges()) {
           unsigned SubReg = MO.getSubReg();
+          LaneBitmask AllSubMask = MRI.getMaxLaneMaskForVReg(Reg);
           LaneBitmask LaneMask = SubReg ? TRI.getSubRegIndexLaneMask(SubReg)
-                                        : MRI.getMaxLaneMaskForVReg(Reg);
+                                        : AllSubMask;
+          // A non-undef subreg def reads other lanes.
+          if (SubReg != 0 && MO.isDef() && !MO.isUndef())
+            LaneMask = AllSubMask;
           for (LiveInterval::SubRange &S : LI.subranges()) {
             if ((S.LaneMask & LaneMask) == 0)
               continue;
@@ -988,7 +992,7 @@
       dbgs() << ":\t" << LR << '\n';
     });
     if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
-      handleMoveDown(LR);
+      handleMoveDown(LR, Reg, LaneMask);
     else
       handleMoveUp(LR, Reg, LaneMask);
     DEBUG(dbgs() << "        -->\t" << LR << '\n');
@@ -997,7 +1001,7 @@
 
   /// Update LR to reflect an instruction has been moved downwards from OldIdx
   /// to NewIdx (OldIdx < NewIdx).
-  void handleMoveDown(LiveRange &LR) {
+  void handleMoveDown(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) {
     LiveRange::iterator E = LR.end();
     // Segment going into OldIdx.
     LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex());
@@ -1034,8 +1038,28 @@
           LiveRange::iterator Prev = std::prev(NewIdxIn);
           Prev->end = NewIdx.getRegSlot();
         }
-        // Extend OldIdxIn.
-        OldIdxIn->end = Next->start;
+        // Terminate OldIdxIn. The new end of that segment is not necessarily
+        // directly adjacent to the start of the new one. Take this case for
+        // example:
+        //   1008B    %vreg131:sub1<def> = COPY %vreg54
+        //   1040B    %vreg131:sub2<def> = COPY %vreg23
+        //   1072B    %vreg131:sub3<def> = COPY %vreg24
+        //   1104B    %vreg131:sub4<def> = COPY %vreg25
+        //   1200B    ... = COPY %vreg131
+        // The live subrange for %vreg131:sub2 will contain
+        //   [...,1008r)[1040r,1200r)
+        //
+        // Consider a move of 1008 to after 1072. This will move the def of
+        // sub1, but the range for other subregisters (including sub2) will
+        // need to be updated as well. The OldIdxIn for sub2 is the one ending
+        // at 1008r, Next is the one starting at 1040r. OldIdxIn cannot be
+        // extended to Next, since 1040r does not actually read sub2. Instead,
+        // we need to find the last use between the previous def of sub2 and
+        // the beginning of the next segment.
+        //
+        OldIdxIn->end = findLastUseBefore(OldIdxIn->start,
+                                          Next->start.getDeadSlot(),
+                                          Reg, LaneMask);
         return;
       }
 
@@ -1194,7 +1218,8 @@
       SlotIndex DefBeforeOldIdx
         = std::max(OldIdxIn->start.getDeadSlot(),
                    NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber()));
-      OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, Reg, LaneMask);
+      OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, OldIdx,
+                                        Reg, LaneMask);
 
       // Did we have a Def at OldIdx? If not we are done now.
       OldIdxOut = std::next(OldIdxIn);
@@ -1305,38 +1330,41 @@
            "Cannot move regmask instruction below another call");
   }
 
-  // Return the last use of reg between NewIdx and OldIdx.
-  SlotIndex findLastUseBefore(SlotIndex Before, unsigned Reg,
-                              LaneBitmask LaneMask) {
+  // Return the last use of Reg between Min and Max, or Min if there is none.
+  SlotIndex findLastUseBefore(SlotIndex Min, SlotIndex Max,
+                              unsigned Reg, LaneBitmask LaneMask) {
     if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-      SlotIndex LastUse = Before;
-      for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
-        if (MO.isUndef())
+      SlotIndex LastUse = Min;
+      for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
+        if (!MO.readsReg())
           continue;
         unsigned SubReg = MO.getSubReg();
-        if (SubReg != 0 && LaneMask != 0
-            && (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask) == 0)
-          continue;
+        if (SubReg != 0 && LaneMask != 0) {
+          LaneBitmask SLM = TRI.getSubRegIndexLaneMask(SubReg);
+          if (MO.isDef())
+            SLM = ~SLM & MRI.getMaxLaneMaskForVReg(Reg);
+          if ((LaneMask & SLM) == 0)
+            continue;
+        }
 
         const MachineInstr &MI = *MO.getParent();
         SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
-        if (InstSlot > LastUse && InstSlot < OldIdx)
+        if (InstSlot > LastUse && InstSlot < Max)
           LastUse = InstSlot.getRegSlot();
       }
       return LastUse;
     }
 
     // This is a regunit interval, so scanning the use list could be very
-    // expensive. Scan upwards from OldIdx instead.
-    assert(Before < OldIdx && "Expected upwards move");
+    // expensive. Scan upwards from Max instead.
     SlotIndexes *Indexes = LIS.getSlotIndexes();
-    MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Before);
+    MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Min);
 
-    // OldIdx may not correspond to an instruction any longer, so set MII to
+    // Max may not correspond to an instruction any longer, so set MII to
     // point to the next instruction after OldIdx, or MBB->end().
     MachineBasicBlock::iterator MII = MBB->end();
     if (MachineInstr *MI = Indexes->getInstructionFromIndex(
-                           Indexes->getNextNonNullIndex(OldIdx)))
+                           Indexes->getNextNonNullIndex(Max)))
       if (MI->getParent() == MBB)
         MII = MI;
 
@@ -1346,9 +1374,9 @@
         continue;
       SlotIndex Idx = Indexes->getInstructionIndex(*MII);
 
-      // Stop searching when Before is reached.
-      if (!SlotIndex::isEarlierInstr(Before, Idx))
-        return Before;
+      // Stop searching when Min is reached.
+      if (!SlotIndex::isEarlierInstr(Min, Idx))
+        return Min;
 
       // Check if MII uses Reg.
       for (MIBundleOperands MO(*MII); MO.isValid(); ++MO)
@@ -1357,8 +1385,8 @@
             TRI.hasRegUnit(MO->getReg(), Reg))
           return Idx.getRegSlot();
     }
-    // Didn't reach Before. It must be the first instruction in the block.
-    return Before;
+    // Didn't reach Min. It must be the first instruction in the block.
+    return Min;
   }
 };
 
Index: test/CodeGen/AMDGPU/scheduler-liveness-1.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scheduler-liveness-1.ll
@@ -0,0 +1,85 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; Make sure llc succeeds and emits image_sample_c. This testcase used to crash.
+; CHECK: image_sample_c
+
+target triple = "amdgcn--"
+
+define amdgpu_ps void @main() #0 {
+main_body:
+  %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
+  %1 = load <4 x i32>, <4 x i32> addrspace(2)* null, align 16, !invariant.load !0
+  %2 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> zeroinitializer, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %3 = extractelement <4 x float> %2, i32 0
+  %4 = fmul float %3, 2.000000e+00
+  %5 = fadd float %4, -1.000000e+00
+  %6 = fmul float %5, undef
+  %7 = fadd float %6, undef
+  %8 = call float @llvm.AMDGPU.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
+  %9 = fmul float %8, 0.000000e+00
+  %10 = fadd float undef, %9
+  %11 = fmul float %0, undef
+  %12 = fadd float %11, undef
+  %13 = fadd float undef, undef
+  %14 = bitcast float %12 to i32
+  %15 = bitcast float %13 to i32
+  %16 = insertelement <4 x i32> undef, i32 %14, i32 0
+  %17 = insertelement <4 x i32> %16, i32 %15, i32 1
+  %18 = insertelement <4 x i32> %17, i32 undef, i32 2
+  %19 = bitcast float %12 to i32
+  %20 = insertelement <4 x i32> undef, i32 %19, i32 0
+  %21 = insertelement <4 x i32> %20, i32 0, i32 1
+  %22 = insertelement <4 x i32> %21, i32 0, i32 2
+  %23 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %22, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %24 = extractelement <4 x float> %23, i32 0
+  %25 = fadd float undef, %24
+  %26 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> zeroinitializer, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %27 = extractelement <4 x float> %26, i32 0
+  %28 = fadd float %25, %27
+  %29 = fadd float %28, undef
+  %30 = fadd float undef, undef
+  %31 = bitcast float %12 to i32
+  %32 = bitcast float %30 to i32
+  %33 = insertelement <4 x i32> undef, i32 %31, i32 0
+  %34 = insertelement <4 x i32> %33, i32 %32, i32 1
+  %35 = insertelement <4 x i32> %34, i32 undef, i32 2
+  %36 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %35, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %37 = extractelement <4 x float> %36, i32 0
+  %38 = fadd float %29, %37
+  %39 = fmul float %38, 1.250000e-01
+  %40 = fmul float %39, 8.000000e+00
+  br label %LOOP
+
+LOOP:                                             ; preds = %LOOP, %main_body
+  %.038 = phi float [ 0x36C0000000000000, %main_body ], [ 0.000000e+00, %LOOP ]
+  %.5 = phi float [ %40, %main_body ], [ undef, %LOOP ]
+  %41 = bitcast float %.038 to i32
+  %42 = icmp sgt i32 %41, 15
+  br i1 %42, label %IF29, label %LOOP
+
+IF29:                                             ; preds = %LOOP
+  %43 = fmul float %.5, 3.125000e-02
+  %44 = fadd float %43, undef
+  %45 = call float @llvm.AMDGPU.clamp.(float %44, float 0.000000e+00, float 1.000000e+00)
+  %46 = fmul float %10, %45
+  %47 = fmul float %46, undef
+  %48 = fadd float %47, undef
+  %49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %48, 11
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.AMDGPU.clamp.(float, float, float) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{}
Index: test/CodeGen/AMDGPU/scheduler-liveness-2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scheduler-liveness-2.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; Make sure llc succeeds and emits tbuffer_store_format_x. This testcase used to crash.
+; CHECK: tbuffer_store_format_x
+
+target triple = "amdgcn--"
+
+define amdgpu_gs void @main(i32 inreg) {
+main_body:
+  %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 36)
+  %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 40)
+  %array_vector21 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float undef, i32 1
+  %array_vector22 = insertelement <4 x float> %array_vector21, float %1, i32 2
+  %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3
+  %array_vector25 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float undef, i32 1
+  %array_vector26 = insertelement <4 x float> %array_vector25, float %2, i32 2
+  %array_vector27 = insertelement <4 x float> %array_vector26, float 0.000000e+00, i32 3
+  %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32>
+  %3 = extractelement <4 x i32> %bc52, i32 undef
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %3, i32 1, i32 64, i32 %0, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc53 = bitcast <4 x float> %array_vector27 to <4 x i32>
+  %4 = extractelement <4 x i32> %bc53, i32 undef
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %4, i32 1, i32 76, i32 %0, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }