Index: lib/CodeGen/LiveIntervalAnalysis.cpp
===================================================================
--- lib/CodeGen/LiveIntervalAnalysis.cpp
+++ lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -948,8 +948,12 @@
     LiveInterval &LI = LIS.getInterval(Reg);
     if (LI.hasSubRanges()) {
       unsigned SubReg = MO.getSubReg();
+      LaneBitmask WholeMask = MRI.getMaxLaneMaskForVReg(Reg);
       LaneBitmask LaneMask = SubReg ? TRI.getSubRegIndexLaneMask(SubReg)
-                                    : MRI.getMaxLaneMaskForVReg(Reg);
+                                    : WholeMask;
+      // A non-undef subreg def reads the other lanes.
+      if (SubReg != 0 && MO.isDef() && !MO.isUndef())
+        LaneMask = WholeMask;
       for (LiveInterval::SubRange &S : LI.subranges()) {
         if ((S.LaneMask & LaneMask) == 0)
           continue;
@@ -988,7 +992,7 @@
       dbgs() << ":\t" << LR << '\n';
     });
     if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
-      handleMoveDown(LR);
+      handleMoveDown(LR, Reg, LaneMask);
     else
       handleMoveUp(LR, Reg, LaneMask);
     DEBUG(dbgs() << "        -->\t" << LR << '\n');
@@ -997,7 +1001,7 @@

   /// Update LR to reflect an instruction has been moved downwards from OldIdx
   /// to NewIdx (OldIdx < NewIdx).
-  void handleMoveDown(LiveRange &LR) {
+  void handleMoveDown(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) {
     LiveRange::iterator E = LR.end();
     // Segment going into OldIdx.
     LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex());
@@ -1034,8 +1038,28 @@
         LiveRange::iterator Prev = std::prev(NewIdxIn);
         Prev->end = NewIdx.getRegSlot();
       }
-      // Extend OldIdxIn.
-      OldIdxIn->end = Next->start;
+      // Terminate OldIdxIn. The new end of that segment is not necessarily
+      // directly adjacent to the start of the new one. Take this case for
+      // example:
+      //   1008B %vreg131:sub1 = COPY %vreg54
+      //   1040B %vreg131:sub2 = COPY %vreg23
+      //   1072B %vreg131:sub3 = COPY %vreg24
+      //   1104B %vreg131:sub4 = COPY %vreg25
+      //   1200B ... = COPY %vreg131
+      // The live subrange for vreg131:sub2 will contain
+      //   [...,1008r)[1040r,1200r)
+      //
+      // Consider a move of 1008 to after 1072. This will move the def of
+      // sub1, but the ranges for the other subregisters (including sub2)
+      // will need to be updated as well. The OldIdxIn for sub2 is the
+      // segment ending at 1008r, and Next is the one starting at 1040r.
+      // OldIdxIn cannot simply be extended to Next, since 1040r does not
+      // actually read sub2. Instead, we need to find the last use between
+      // the previous def of sub2 and the beginning of the next segment.
+      //
+      OldIdxIn->end = findLastUseBefore(OldIdxIn->start,
+                                        Next->start.getDeadSlot(),
+                                        Reg, LaneMask);
       return;
     }
@@ -1194,7 +1218,8 @@
       SlotIndex DefBeforeOldIdx
         = std::max(OldIdxIn->start.getDeadSlot(),
                    NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber()));
-      OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, Reg, LaneMask);
+      OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, OldIdx,
+                                        Reg, LaneMask);

       // Did we have a Def at OldIdx? If not we are done now.
       OldIdxOut = std::next(OldIdxIn);
@@ -1305,38 +1330,41 @@
            "Cannot move regmask instruction below another call");
   }

-  // Return the last use of reg between NewIdx and OldIdx.
-  SlotIndex findLastUseBefore(SlotIndex Before, unsigned Reg,
-                              LaneBitmask LaneMask) {
+  // Return the last use of reg between Min and Max.
+  SlotIndex findLastUseBefore(SlotIndex Min, SlotIndex Max,
+                              unsigned Reg, LaneBitmask LaneMask) {
     if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-      SlotIndex LastUse = Before;
-      for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
-        if (MO.isUndef())
+      SlotIndex LastUse = Min;
+      for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
+        if (!MO.readsReg())
           continue;
         unsigned SubReg = MO.getSubReg();
-        if (SubReg != 0 && LaneMask != 0
-            && (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask) == 0)
-          continue;
+        if (SubReg != 0 && LaneMask != 0) {
+          LaneBitmask SLM = TRI.getSubRegIndexLaneMask(SubReg);
+          if (MO.isDef())
+            SLM = ~SLM & MRI.getMaxLaneMaskForVReg(Reg);
+          if ((LaneMask & SLM) == 0)
+            continue;
+        }
         const MachineInstr &MI = *MO.getParent();
         SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
-        if (InstSlot > LastUse && InstSlot < OldIdx)
+        if (InstSlot > LastUse && InstSlot < Max)
           LastUse = InstSlot.getRegSlot();
       }
       return LastUse;
     }

     // This is a regunit interval, so scanning the use list could be very
-    // expensive. Scan upwards from OldIdx instead.
-    assert(Before < OldIdx && "Expected upwards move");
+    // expensive. Scan upwards from Max instead.
     SlotIndexes *Indexes = LIS.getSlotIndexes();
-    MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Before);
+    MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Min);

-    // OldIdx may not correspond to an instruction any longer, so set MII to
+    // Max may not correspond to an instruction any longer, so set MII to
     // point to the next instruction after OldIdx, or MBB->end().
     MachineBasicBlock::iterator MII = MBB->end();
     if (MachineInstr *MI = Indexes->getInstructionFromIndex(
-                                    Indexes->getNextNonNullIndex(OldIdx)))
+                                    Indexes->getNextNonNullIndex(Max)))
       if (MI->getParent() == MBB)
         MII = MI;
@@ -1346,9 +1374,9 @@
         continue;
       SlotIndex Idx = Indexes->getInstructionIndex(*MII);

-      // Stop searching when Before is reached.
-      if (!SlotIndex::isEarlierInstr(Before, Idx))
-        return Before;
+      // Stop searching when Min is reached.
+      if (!SlotIndex::isEarlierInstr(Min, Idx))
+        return Min;

       // Check if MII uses Reg.
       for (MIBundleOperands MO(*MII); MO.isValid(); ++MO)
@@ -1357,8 +1385,8 @@
             TRI.hasRegUnit(MO->getReg(), Reg))
           return Idx.getRegSlot();
     }
-    // Didn't reach Before. It must be the first instruction in the block.
-    return Before;
+    // Didn't reach Min. It must be the first instruction in the block.
+    return Min;
   }
 };
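
(Illustration for reviewers, not part of the patch.) The rule the change relies on is that a non-undef def of a subregister behaves like a read of the remaining lanes: the other lanes flow through the instruction unchanged, so the new findLastUseBefore() has to count such defs when it searches for the last instruction touching a subrange between Min and Max. The standalone sketch below models that rule with plain integers. Operand, touchesLanes, lastUseBefore, WholeMask, and the slot numbers are invented stand-ins for the LLVM types (LaneBitmask, MachineOperand, SlotIndex), not real APIs.

// illustration.cpp -- build with: c++ -std=c++11 illustration.cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// One lane per subregister, standing in for TRI.getSubRegIndexLaneMask().
static const uint32_t Sub1 = 0x1, Sub2 = 0x2, Sub3 = 0x4, Sub4 = 0x8;
// Whole-register mask, standing in for MRI.getMaxLaneMaskForVReg().
static const uint32_t WholeMask = 0xF;

// An operand: instruction slot, lane mask of the subregister, def/use flag.
struct Operand {
  unsigned Slot;
  uint32_t SubMask;
  bool IsDef;
};

// Does this operand touch the lanes of the queried subrange? A subreg def
// preserves the other lanes, so for liveness it acts as a read of the
// complement (this mirrors the "SLM = ~SLM & ..." step in the patch).
static bool touchesLanes(uint32_t QueryMask, const Operand &O) {
  uint32_t SLM = O.SubMask;
  if (O.IsDef)
    SLM = ~SLM & WholeMask;
  return (QueryMask & SLM) != 0;
}

// Toy analogue of the patched findLastUseBefore(): the highest slot in the
// open interval (Min, Max) whose operand touches QueryMask, or Min if none.
static unsigned lastUseBefore(const std::vector<Operand> &Ops, unsigned Min,
                              unsigned Max, uint32_t QueryMask) {
  unsigned Last = Min;
  for (const Operand &O : Ops)
    if (touchesLanes(QueryMask, O) && O.Slot > Last && O.Slot < Max)
      Last = O.Slot;
  return Last;
}

int main() {
  // The handleMoveDown() comment's example, after the def of sub1 has been
  // moved from 1008 down to (say) 1080:
  std::vector<Operand> Ops = {
      {1040, Sub2, true},       // %vreg131:sub2 = COPY %vreg23
      {1072, Sub3, true},       // %vreg131:sub3 = COPY %vreg24
      {1080, Sub1, true},       // moved: %vreg131:sub1 = COPY %vreg54
      {1104, Sub4, true},       // %vreg131:sub4 = COPY %vreg25
      {1200, WholeMask, false}, // ... = COPY %vreg131
  };
  // A def of sub1 counts as a read of the sub2 lanes; a plain use of sub1
  // would not.
  std::printf("sub1 def touches sub2 lanes: %d\n",
              touchesLanes(Sub2, Operand{1080, Sub1, true})); // prints 1
  // New end of the sub2 segment that used to stop at 1008r: search up to
  // the start of the next sub2 segment (1040). Nothing in between touches
  // the sub2 lanes any more, so the result collapses to Min (960 here, an
  // arbitrary stand-in for the previous def of sub2).
  std::printf("last use of sub2 before 1040: %u\n",
              lastUseBefore(Ops, 960, 1040, Sub2)); // prints 960
  return 0;
}

This is also why, in the vreg131 example from the comment, the sub2 segment that used to end at 1008r cannot simply be extended to 1040r: after the move no instruction remains between the previous def of sub2 and 1040 that touches the sub2 lanes, so the search falls back to Min.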
Index: test/CodeGen/AMDGPU/gs-array-copy.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/gs-array-copy.ll
@@ -0,0 +1,172 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+
+; Check for sane output instead of a crash.
+; CHECK: s_endpgm
+
+target triple = "amdgcn--"
+
+define amdgpu_gs void @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
+main_body:
+  %15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
+  %16 = load <16 x i8>, <16 x i8> addrspace(2)* %15, align 16, !invariant.load !0
+  %17 = call float @llvm.SI.load.const(<16 x i8> %16, i32 0)
+  %18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 1, !amdgpu.uniform !0
+  %19 = load <16 x i8>, <16 x i8> addrspace(2)* %18, align 16, !invariant.load !0
+  %20 = call float @llvm.SI.load.const(<16 x i8> %19, i32 16)
+  %21 = call float @llvm.SI.load.const(<16 x i8> %19, i32 20)
+  %22 = call float @llvm.SI.load.const(<16 x i8> %19, i32 24)
+  %23 = call float @llvm.SI.load.const(<16 x i8> %19, i32 28)
+  %24 = call float @llvm.SI.load.const(<16 x i8> %19, i32 32)
+  %25 = call float @llvm.SI.load.const(<16 x i8> %19, i32 36)
+  %26 = call float @llvm.SI.load.const(<16 x i8> %19, i32 40)
+  %27 = call float @llvm.SI.load.const(<16 x i8> %19, i32 44)
+  %28 = call float @llvm.SI.load.const(<16 x i8> %19, i32 48)
+  %29 = call float @llvm.SI.load.const(<16 x i8> %19, i32 52)
+  %30 = call float @llvm.SI.load.const(<16 x i8> %19, i32 56)
+  %31 = call float @llvm.SI.load.const(<16 x i8> %19, i32 60)
+  %32 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i64 3, !amdgpu.uniform !0
+  %33 = load <16 x i8>, <16 x i8> addrspace(2)* %32, align 16, !invariant.load !0
+  %34 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i64 4, !amdgpu.uniform !0
+  %35 = load <16 x i8>, <16 x i8> addrspace(2)* %34, align 16, !invariant.load !0
+  %36 = bitcast float %17 to i32
+  %array_vector1 = insertelement <4 x float> undef, float %20, i32 1
+  %array_vector2 = insertelement <4 x float> %array_vector1, float %24, i32 2
+  %array_vector3 = insertelement <4 x float> %array_vector2, float %28, i32 3
+  %array_vector5 = insertelement <4 x float> undef, float %21, i32 1
+  %array_vector6 = insertelement <4 x float> %array_vector5, float %25, i32 2
+  %array_vector7 = insertelement <4 x float> %array_vector6, float %29, i32 3
+  %array_vector9 = insertelement <4 x float> undef, float %22, i32 1
+  %array_vector10 = insertelement <4 x float> %array_vector9, float %26, i32 2
+  %array_vector11 = insertelement <4 x float> %array_vector10, float %30, i32 3
+  %array_vector13 = insertelement <4 x float> undef, float %23, i32 1
+  %array_vector14 = insertelement <4 x float> %array_vector13, float %27, i32 2
+  %array_vector15 = insertelement <4 x float> %array_vector14, float %31, i32 3
+  %37 = shl i32 %7, 2
+  %38 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %37, i32 4096, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %39 = shl i32 %7, 2
+  %40 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %39, i32 4352, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %41 = shl i32 %7, 2
+  %42 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %41, i32 4608, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %43 = shl i32 %7, 2
+  %44 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %43, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  call void @llvm.AMDGPU.kill(float 1.000000e+00)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %38, i32 1, i32 0, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %40, i32 1, i32 12, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %42, i32 1, i32 24, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %44, i32 1, i32 36, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc = bitcast <4 x float> %array_vector3 to <4 x i32>
+  %45 = extractelement <4 x i32> %bc, i32 %36
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %45, i32 1, i32 48, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc48 = bitcast <4 x float> %array_vector7 to <4 x i32>
+  %46 = extractelement <4 x i32> %bc48, i32 %36
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %46, i32 1, i32 60, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32>
+  %47 = extractelement <4 x i32> %bc49, i32 %36
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %47, i32 1, i32 72, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc50 = bitcast <4 x float> %array_vector15 to <4 x i32>
+  %48 = extractelement <4 x i32> %bc50, i32 %36
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %48, i32 1, i32 84, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.sendmsg(i32 34, i32 %6)
+  %49 = bitcast float %17 to i32
+  %array_vector17 = insertelement <4 x float> undef, float %20, i32 1
+  %array_vector18 = insertelement <4 x float> %array_vector17, float %24, i32 2
+  %array_vector19 = insertelement <4 x float> %array_vector18, float %28, i32 3
+  %array_vector21 = insertelement <4 x float> undef, float %21, i32 1
+  %array_vector22 = insertelement <4 x float> %array_vector21, float %25, i32 2
+  %array_vector23 = insertelement <4 x float> %array_vector22, float %29, i32 3
+  %array_vector25 = insertelement <4 x float> undef, float %22, i32 1
+  %array_vector26 = insertelement <4 x float> %array_vector25, float %26, i32 2
+  %array_vector27 = insertelement <4 x float> %array_vector26, float %30, i32 3
+  %array_vector29 = insertelement <4 x float> undef, float %23, i32 1
+  %array_vector30 = insertelement <4 x float> %array_vector29, float %27, i32 2
+  %array_vector31 = insertelement <4 x float> %array_vector30, float %31, i32 3
+  %50 = shl i32 %8, 2
+  %51 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %50, i32 4096, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %52 = shl i32 %8, 2
+  %53 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %52, i32 4352, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %54 = shl i32 %8, 2
+  %55 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %54, i32 4608, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %56 = shl i32 %8, 2
+  %57 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %56, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  call void @llvm.AMDGPU.kill(float 1.000000e+00)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %51, i32 1, i32 4, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %53, i32 1, i32 16, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %55, i32 1, i32 28, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %57, i32 1, i32 40, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc51 = bitcast <4 x float> %array_vector19 to <4 x i32>
+  %58 = extractelement <4 x i32> %bc51, i32 %49
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %58, i32 1, i32 52, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32>
+  %59 = extractelement <4 x i32> %bc52, i32 %49
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %59, i32 1, i32 64, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc53 = bitcast <4 x float> %array_vector27 to <4 x i32>
+  %60 = extractelement <4 x i32> %bc53, i32 %49
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %60, i32 1, i32 76, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc54 = bitcast <4 x float> %array_vector31 to <4 x i32>
+  %61 = extractelement <4 x i32> %bc54, i32 %49
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %61, i32 1, i32 88, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.sendmsg(i32 34, i32 %6)
+  %62 = bitcast float %17 to i32
+  %array_vector33 = insertelement <4 x float> undef, float %20, i32 1
+  %array_vector34 = insertelement <4 x float> %array_vector33, float %24, i32 2
+  %array_vector35 = insertelement <4 x float> %array_vector34, float %28, i32 3
+  %array_vector37 = insertelement <4 x float> undef, float %21, i32 1
+  %array_vector38 = insertelement <4 x float> %array_vector37, float %25, i32 2
+  %array_vector39 = insertelement <4 x float> %array_vector38, float %29, i32 3
+  %array_vector41 = insertelement <4 x float> undef, float %22, i32 1
+  %array_vector42 = insertelement <4 x float> %array_vector41, float %26, i32 2
+  %array_vector43 = insertelement <4 x float> %array_vector42, float %30, i32 3
+  %array_vector45 = insertelement <4 x float> undef, float %23, i32 1
+  %array_vector46 = insertelement <4 x float> %array_vector45, float %27, i32 2
+  %array_vector47 = insertelement <4 x float> %array_vector46, float %31, i32 3
+  %63 = shl i32 %10, 2
+  %64 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %63, i32 4096, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %65 = shl i32 %10, 2
+  %66 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %65, i32 4352, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %67 = shl i32 %10, 2
+  %68 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %67, i32 4608, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  %69 = shl i32 %10, 2
+  %70 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %33, i32 %69, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  call void @llvm.AMDGPU.kill(float 1.000000e+00)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %64, i32 1, i32 8, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %66, i32 1, i32 20, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %68, i32 1, i32 32, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %70, i32 1, i32 44, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc55 = bitcast <4 x float> %array_vector35 to <4 x i32>
+  %71 = extractelement <4 x i32> %bc55, i32 %62
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %71, i32 1, i32 56, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc56 = bitcast <4 x float> %array_vector39 to <4 x i32>
+  %72 = extractelement <4 x i32> %bc56, i32 %62
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %72, i32 1, i32 68, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc57 = bitcast <4 x float> %array_vector43 to <4 x i32>
+  %73 = extractelement <4 x i32> %bc57, i32 %62
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %73, i32 1, i32 80, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc58 = bitcast <4 x float> %array_vector47 to <4 x i32>
+  %74 = extractelement <4 x i32> %bc58, i32 %62
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %35, i32 %74, i32 1, i32 92, i32 %5, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.sendmsg(i32 34, i32 %6)
+  call void @llvm.SI.sendmsg(i32 3, i32 %6)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #0
+
+; Function Attrs: nounwind readonly
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind
+declare void @llvm.AMDGPU.kill(float) #2
+
+; Function Attrs: nounwind
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #2
+
+; Function Attrs: nounwind
+declare void @llvm.SI.sendmsg(i32, i32) #2
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind }
+
+!0 = !{}
Index: test/CodeGen/AMDGPU/tex-miplevel-selection-scheduler.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/tex-miplevel-selection-scheduler.ll
@@ -0,0 +1,102 @@
+; RUN: llc -march=amdgcn -join-liveintervals=0 < %s | FileCheck %s
+
+; Check for sane output instead of a crash.
+; CHECK: image_sample_c_d_o
+
+target triple = "amdgcn--"
+
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
+main_body:
+  %23 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
+  %24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !invariant.load !0
+  %25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
+  %26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
+  %27 = call float @llvm.SI.load.const(<16 x i8> %24, i32 48)
+  %28 = call float @llvm.SI.load.const(<16 x i8> %24, i32 52)
+  %29 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
+  %30 = load <8 x i32>, <8 x i32> addrspace(2)* %29, align 32, !invariant.load !0
+  %31 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
+  %32 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %31, i64 0, i64 3, !amdgpu.uniform !0
+  %33 = load <4 x i32>, <4 x i32> addrspace(2)* %32, align 16, !invariant.load !0
+  %34 = extractelement <8 x i32> %30, i32 7
+  %35 = extractelement <4 x i32> %33, i32 0
+  %36 = and i32 %35, %34
+  %37 = insertelement <4 x i32> %33, i32 %36, i32 0
+  %38 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 2, !amdgpu.uniform !0
+  %39 = load <8 x i32>, <8 x i32> addrspace(2)* %38, align 32, !invariant.load !0
+  %40 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
+  %41 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %40, i64 0, i64 7, !amdgpu.uniform !0
+  %42 = load <4 x i32>, <4 x i32> addrspace(2)* %41, align 16, !invariant.load !0
+  %43 = extractelement <8 x i32> %39, i32 7
+  %44 = extractelement <4 x i32> %42, i32 0
+  %45 = and i32 %44, %43
+  %46 = insertelement <4 x i32> %42, i32 %45, i32 0
+  %47 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %8)
+  %48 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %8)
+  %49 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %6, <2 x i32> %8)
+  %50 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %6, <2 x i32> %8)
+  %51 = fadd float %50, 0xBFA99999A0000000
+  %52 = fadd float %47, 0.000000e+00
+  %53 = fadd float %48, 0.000000e+00
+  %54 = fadd float %49, 0.000000e+00
+  %55 = fadd float %50, 0x3FA99999A0000000
+  %56 = bitcast float %51 to i32
+  %57 = bitcast float %25 to i32
+  %58 = bitcast float %26 to i32
+  %59 = bitcast float %27 to i32
+  %60 = bitcast float %28 to i32
+  %61 = bitcast float %47 to i32
+  %62 = bitcast float %48 to i32
+  %63 = bitcast float %49 to i32
+  %64 = insertelement <16 x i32> undef, i32 %56, i32 1
+  %65 = insertelement <16 x i32> %64, i32 %57, i32 2
+  %66 = insertelement <16 x i32> %65, i32 %58, i32 3
+  %67 = insertelement <16 x i32> %66, i32 %59, i32 4
+  %68 = insertelement <16 x i32> %67, i32 %60, i32 5
+  %69 = insertelement <16 x i32> %68, i32 %61, i32 6
+  %70 = insertelement <16 x i32> %69, i32 %62, i32 7
+  %71 = insertelement <16 x i32> %70, i32 %63, i32 8
+  %72 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %71, <8 x i32> %30, <4 x i32> %37, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+  %73 = extractelement <4 x float> %72, i32 0
+  %74 = bitcast float %55 to i32
+  %75 = bitcast float %25 to i32
+  %76 = bitcast float %26 to i32
+  %77 = bitcast float %27 to i32
+  %78 = bitcast float %28 to i32
+  %79 = bitcast float %52 to i32
+  %80 = bitcast float %53 to i32
+  %81 = bitcast float %54 to i32
+  %82 = insertelement <16 x i32> undef, i32 %74, i32 1
+  %83 = insertelement <16 x i32> %82, i32 %75, i32 2
+  %84 = insertelement <16 x i32> %83, i32 %76, i32 3
+  %85 = insertelement <16 x i32> %84, i32 %77, i32 4
+  %86 = insertelement <16 x i32> %85, i32 %78, i32 5
+  %87 = insertelement <16 x i32> %86, i32 %79, i32 6
+  %88 = insertelement <16 x i32> %87, i32 %80, i32 7
+  %89 = insertelement <16 x i32> %88, i32 %81, i32 8
+  %90 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %89, <8 x i32> %39, <4 x i32> %46, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+  %91 = extractelement <4 x float> %90, i32 0
+  %92 = fmul float %73, %91
+  %93 = bitcast float %5 to i32
+  %94 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %93, 10
+  %95 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %94, float %92, 11
+  %96 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %95, float %92, 12
+  %97 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %96, float %92, 13
+  %98 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %97, float %92, 14
+  %99 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %98, float %21, 24
+  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %99
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { "InitialPSInputAddr"="36983" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{}