Index: lib/CodeGen/LiveIntervals.cpp
===================================================================
--- lib/CodeGen/LiveIntervals.cpp
+++ lib/CodeGen/LiveIntervals.cpp
@@ -1296,6 +1296,36 @@
           if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdx, OldIdxIn->end))
             OldIdxIn->end = NewIdx.getRegSlot();
         }
+      } else if (OldIdxIn != E
+          && SlotIndex::isEarlierInstr(NewIdxOut->start, NewIdx)
+          && SlotIndex::isEarlierInstr(NewIdx, NewIdxOut->end)) {
+        // OldIdxVNI is a dead def that has been moved into the middle of
+        // another value in LR. That can happen when LR is a whole register,
+        // but the dead def is a write to a subreg that is dead at NewIdx.
+        // The dead def may have been moved across other values
+        // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut)
+        // down one position.
+        //    |- X0/NewIdxOut -| ... |- Xn-1 -| |- Xn/OldIdxOut -| |- next -|
+        // => |- X0/NewIdxOut -| |- X0 -| ... |- Xn-1 -| |- next -|
+        std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut));
+        // Modify the segment at NewIdxOut and the following segment to meet at
+        // the point of the dead def, with the following segment getting
+        // OldIdxVNI as its value number.
+        *NewIdxOut = LiveRange::Segment(
+            NewIdxOut->start, NewIdxDef.getRegSlot(), NewIdxOut->valno);
+        *(NewIdxOut + 1) = LiveRange::Segment(
+            NewIdxDef.getRegSlot(), (NewIdxOut + 1)->end, OldIdxVNI);
+        OldIdxVNI->def = NewIdxDef;
+        // Modify subsequent segments to be defined by the moved def OldIdxVNI.
+        for (auto Idx = NewIdxOut + 2; Idx <= OldIdxOut; ++Idx)
+          Idx->valno = OldIdxVNI;
+        // Aggressively remove all dead flags from the former dead definition.
+        // Kill/dead flags shouldn't be used while live intervals exist; they
+        // will be reinserted by VirtRegRewriter.
+        if (MachineInstr *KillMI = LIS.getInstructionFromIndex(NewIdx))
+          for (MIBundleOperands MO(*KillMI); MO.isValid(); ++MO)
+            if (MO->isReg() && !MO->isUse())
+              MO->setIsDead(false);
       } else {
         // OldIdxVNI is a dead def. It may have been moved across other values
         // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut)
Index: test/CodeGen/AMDGPU/machine-scheduler-move-dead-def-up.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/machine-scheduler-move-dead-def-up.ll
@@ -0,0 +1,274 @@
+; RUN: llc -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; This bugpoint-reduced test used to assert in machine scheduling when moving a
+; dead def of a subreg up into the middle of another value's live segment.
+
+; GCN-LABEL: {{^}}_amdgpu_cs_main:
+
+source_filename = "../test/CodeGen/AMDGPU/machine-scheduler-move-dead-def-up.ll"
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+target triple = "amdgcn--amdpal"
+
+; Function Attrs: nounwind readnone speculatable
+declare float @llvm.floor.f32(float) #0
+
+; Function Attrs: nounwind
+define dllexport amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %arg, <3 x i32> %arg1) local_unnamed_addr #1 {
+.entry:
+  %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract.rhs = extractelement <3 x i32> %arg1, i32 0
+  %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract = add i32 0, %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract.rhs
+  br i1 undef, label %bb, label %bb2
+
+bb:                                               ; preds = %.entry
+  br label %bb2
+
+bb2:                                              ; preds = %bb, %.entry
+  br i1 undef, label %bb3, label %"SimulateParticle(i1;f1;.exit"
+
+bb3:                                              ; preds = %bb2
+  br i1 undef, label %bb5, label %bb4
+
+bb4:                                              ; preds = %bb3
+  br label %bb5
+
+bb5:                                              ; preds = %bb4, %bb3
+  br i1 undef, label %"SimulateParticle(i1;f1;.exit", label %bb6
+
+bb6:                                              ; preds = %bb5
+  br i1 undef, label %"SimulateParticle(i1;f1;.exit", label %bb7
+
+bb7:                                              ; preds = %bb6
+  br i1 undef, label %"SimulateParticle(i1;f1;.exit", label %bb8
+
+bb8:                                              ; preds = %bb7
+  br i1 undef, label %._crit_edge, label %bb9
+
+._crit_edge:                                      ; preds = %bb8
+  br label %bb32
+
+bb9:                                              ; preds = %bb8
+  switch i32 undef, label %bb10 [
+    i32 0, label %bb11
+    i32 1, label %bb12
+    i32 2, label %bb13
+    i32 3, label %bb18
+    i32 4, label %bb19
+    i32 5, label %bb20
+  ]
+
+bb10:                                             ; preds = %bb9
+  br label %"Distribute(i1;vf4;.exit.i"
+
+bb11:                                             ; preds = %bb9
+  br label %"Distribute(i1;vf4;.exit.i"
+
+bb12:                                             ; preds = %bb9
+  br label %"Distribute(i1;vf4;.exit.i"
+
+bb13:                                             ; preds = %bb9
+  br i1 undef, label %bb14, label %bb15
+
+bb14:                                             ; preds = %bb13
+  br label %"Distribute(i1;vf4;.exit.i"
+
+bb15:                                             ; preds = %bb13
+  br i1 undef, label %bb16, label %bb17
+
+bb16:                                             ; preds = %bb15
+  br label %"Distribute(i1;vf4;.exit.i"
+
+bb17:                                             ; preds = %bb15
+  br label %"Distribute(i1;vf4;.exit.i"
+
+bb18:                                             ; preds = %bb9
+  br label %"Distribute(i1;vf4;.exit.i"
+
+bb19:                                             ; preds = %bb9
+  br label %"Distribute(i1;vf4;.exit.i"
+
+bb20:                                             ; preds = %bb9
+  br label %"Distribute(i1;vf4;.exit.i"
+
+"Distribute(i1;vf4;.exit.i":                      ; preds = %bb20, %bb19, %bb18, %bb17, %bb16, %bb14, %bb12, %bb11, %bb10
+  switch i32 undef, label %bb21 [
+    i32 0, label %bb22
+    i32 1, label %bb23
+    i32 2, label %bb24
+    i32 3, label %bb29
+    i32 4, label %bb30
+    i32 5, label %bb31
+  ]
+
+bb21:                                             ; preds = %"Distribute(i1;vf4;.exit.i"
+  br label %"EmitParticle(i1;i1;f1;.exit"
+
+bb22:                                             ; preds = %"Distribute(i1;vf4;.exit.i"
+  br label %"EmitParticle(i1;i1;f1;.exit"
+
+bb23:                                             ; preds = %"Distribute(i1;vf4;.exit.i"
+  br label %"EmitParticle(i1;i1;f1;.exit"
+
+bb24:                                             ; preds = %"Distribute(i1;vf4;.exit.i"
+  br i1 undef, label %bb25, label %bb26
+
+bb25:                                             ; preds = %bb24
+  br label %"EmitParticle(i1;i1;f1;.exit"
+
+bb26:                                             ; preds = %bb24
+  br i1 undef, label %bb27, label %bb28
+
+bb27:                                             ; preds = %bb26
+  br label %"EmitParticle(i1;i1;f1;.exit"
+
+bb28:                                             ; preds = %bb26
+  br label %"EmitParticle(i1;i1;f1;.exit"
+
+bb29:                                             ; preds = %"Distribute(i1;vf4;.exit.i"
+  br label %"EmitParticle(i1;i1;f1;.exit"
+
+bb30:                                             ; preds = %"Distribute(i1;vf4;.exit.i"
+  br label %"EmitParticle(i1;i1;f1;.exit"
+
+bb31:                                             ; preds = %"Distribute(i1;vf4;.exit.i"
+  br label %"EmitParticle(i1;i1;f1;.exit"
+
+"EmitParticle(i1;i1;f1;.exit":                    ; preds = %bb31, %bb30, %bb29, %bb28, %bb27, %bb25, %bb23, %bb22, %bb21
+  br label %bb32
+
+bb32:                                             ; preds = %"EmitParticle(i1;i1;f1;.exit", %._crit_edge
+  br i1 undef, label %"SimulateParticle(i1;f1;.exit", label %bb33
+
+bb33:                                             ; preds = %bb32
+  %tmp = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %arg, i32 0, i1 false) #1
+  %tmp34 = bitcast <4 x i32> %tmp to <4 x float>
+  %tmp35 = shl i32 %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract, 4
+  %tmp36 = and i32 %tmp35, 131056
+  %tmp37 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> undef, i32 %tmp36, i1 false) #1
+  %tmp38 = bitcast <4 x i32> %tmp37 to <4 x float>
+  %tmp39 = fadd reassoc nnan arcp contract <4 x float> %tmp34, %tmp38
+  %tmp40 = fmul reassoc nnan arcp contract float undef, 0x3F20000000000000
+  %tmp41 = tail call float @llvm.floor.f32(float %tmp40) #1
+  %tmp42 = fmul reassoc nnan arcp contract float %tmp41, 0x3FBF972480000000
+  %tmp43 = insertelement <4 x float> undef, float %tmp42, i32 0
+  %tmp44 = shufflevector <4 x float> %tmp43, <4 x float> undef, <4 x i32> zeroinitializer
+  %tmp45 = fadd reassoc nnan arcp contract <4 x float> %tmp44, %tmp39
+  %x0.i41 = extractelement <4 x float> %tmp45, i32 0
+  %tmp46 = call float @llvm.amdgcn.fract.f32(float %x0.i41) #1
+  %tmp47 = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1
+  %tmp48 = shufflevector <4 x float> %tmp47, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  br i1 undef, label %bb49, label %bb54
+
+bb49:                                             ; preds = %bb33
+  br i1 undef, label %bb63, label %bb50
+
+bb50:                                             ; preds = %bb49
+  br i1 undef, label %bb51, label %bb63
+
+bb51:                                             ; preds = %bb50
+  br i1 undef, label %bb52, label %bb53
+
+bb52:                                             ; preds = %bb51
+  br label %bb53
+
+bb53:                                             ; preds = %bb52, %bb51
+  br label %bb63
+
+bb54:                                             ; preds = %"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618", %bb33
+  %velocity.i.0 = phi <3 x float> [ %tmp83, %"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618" ], [ %tmp48, %bb33 ]
+  br i1 undef, label %bb55, label %.critedge
+
+bb55:                                             ; preds = %bb54
+  br i1 undef, label %bb56, label %.critedge
+
+bb56:                                             ; preds = %bb55
+  br i1 undef, label %"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i", label %.critedge
+
+"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i":          ; preds = %bb56
+  br i1 undef, label %bb57, label %.critedge
+
+bb57:                                             ; preds = %"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i"
+  br i1 undef, label %"OctWrap(vf2;.exit.i.i", label %"NormalOctDecode(vf2;b1;.exit.i"
+
+"OctWrap(vf2;.exit.i.i":                          ; preds = %bb57
+  br label %"NormalOctDecode(vf2;b1;.exit.i"
+
+"NormalOctDecode(vf2;b1;.exit.i":                 ; preds = %"OctWrap(vf2;.exit.i.i", %bb57
+  %end.i120.i = fsub <3 x float> %tmp48, undef
+  %scale50.i = fmul <3 x float> undef, %end.i120.i
+  %tmp58 = fsub reassoc nnan arcp contract <3 x float> %scale50.i, undef
+  br i1 undef, label %bb59, label %.critedge
+
+bb59:                                             ; preds = %"NormalOctDecode(vf2;b1;.exit.i"
+  br i1 undef, label %bb60, label %.critedge
+
+bb60:                                             ; preds = %bb59
+  br label %.critedge
+
+.critedge:                                        ; preds = %bb60, %bb59, %"NormalOctDecode(vf2;b1;.exit.i", %"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i", %bb56, %bb55, %bb54
+  %velocity.i.1 = phi <3 x float> [ %tmp58, %bb60 ], [ %tmp58, %bb59 ], [ %velocity.i.0, %"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i" ], [ %velocity.i.0, %bb55 ], [ %velocity.i.0, %bb56 ], [ %velocity.i.0, %bb54 ], [ %tmp58, %"NormalOctDecode(vf2;b1;.exit.i" ]
+  %tmp61 = shufflevector <3 x float> %velocity.i.1, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %tmp61, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1
+  br i1 undef, label %bb62, label %"SimulateParticle(i1;f1;.exit"
+
+bb62:                                             ; preds = %.critedge
+  br label %"SimulateParticle(i1;f1;.exit"
+
+"SimulateParticle(i1;f1;.exit":                   ; preds = %bb62, %.critedge, %bb32, %bb7, %bb6, %bb5, %bb2
+  ret void
+
+bb63:                                             ; preds = %bb53, %bb50, %bb49
+  %param6.i.1 = phi <3 x float> [ undef, %bb53 ], [ %tmp48, %bb50 ], [ %tmp48, %bb49 ]
+  br i1 undef, label %"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618", label %bb64
+
+bb64:                                             ; preds = %bb63
+  br i1 undef, label %bb65, label %"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618"
+
+bb65:                                             ; preds = %bb64
+  br i1 undef, label %bb66, label %bb67
+
+bb66:                                             ; preds = %bb65
+  br label %bb67
+
+bb67:                                             ; preds = %bb66, %bb65
+  br label %"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618"
+
+"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618":  ; preds = %bb67, %bb64, %bb63
+  %param6.i.1.1 = phi <3 x float> [ zeroinitializer, %bb67 ], [ %param6.i.1, %bb64 ], [ %param6.i.1, %bb63 ]
+  %tmp68 = fmul reassoc nnan arcp contract float %tmp46, 2.000000e+00
+  %tmp69 = fadd reassoc nnan arcp contract float %tmp68, -1.000000e+00
+  %tmp70 = fmul reassoc nnan arcp contract float %tmp69, undef
+  %tmp71 = fadd reassoc nnan arcp contract float %tmp70, undef
+  %tmp72 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %arg, i32 144, i1 false) #1
+  %tmp73 = bitcast <4 x i32> %tmp72 to <4 x float>
+  %tmp74 = shufflevector <4 x float> %tmp73, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %tmp75 = fdiv float 1.000000e+00, %tmp71
+  %tmp76 = fmul float undef, %tmp75
+  %tmp77 = fmul float undef, %tmp75
+  %tmp78 = fmul float undef, %tmp75
+  %tmp79 = insertelement <3 x float> undef, float %tmp76, i32 0
+  %tmp80 = insertelement <3 x float> %tmp79, float %tmp77, i32 1
+  %tmp81 = insertelement <3 x float> %tmp80, float %tmp78, i32 2
+  %tmp82 = fadd reassoc nnan arcp contract <3 x float> %tmp74, %tmp81
+  %scale.i = fmul <3 x float> undef, %tmp82
+  %tmp83 = fadd reassoc nnan arcp contract <3 x float> %param6.i.1.1, %scale.i
+  br label %bb54
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare float @llvm.amdgcn.fract.f32(float) #0
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i1) #3
+
+; Function Attrs: nounwind writeonly
+declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #4
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind readnone }
+attributes #4 = { nounwind writeonly }
+