Index: lib/CodeGen/RegisterCoalescer.cpp
===================================================================
--- lib/CodeGen/RegisterCoalescer.cpp
+++ lib/CodeGen/RegisterCoalescer.cpp
@@ -2339,8 +2339,41 @@
       continue;
     DEBUG(dbgs() << "\t\tconflict at " << PrintReg(Reg) << ':' << i
                  << '@' << LR.getValNumInfo(i)->def << '\n');
-    if (SubRangeJoin)
-      return false;
+    if (SubRangeJoin) {
+      // Unresolved subrange conflicts may remain here as a result of merging
+      // registers where subregister definitions of one register are
+      // overwritten by corresponding definitions of the other register.
+      // For example, in this case, when coalescing vreg140 into vreg131:
+      //   704B     %vreg140<def> = ...
+      //   720B     %vreg131<def> = COPY %vreg140
+      //   736B     %vreg131:sub1<def> = COPY %vreg48
+      //   768B     %vreg131:sub2<def> = COPY %vreg23
+      //   800B     %vreg131:sub3<def> = COPY %vreg24
+      //   832B     %vreg131:sub4<def> = COPY %vreg25
+      //   864B     %vreg131:sub5<def> = COPY %vreg26
+      //   896B     %vreg131:sub6<def> = COPY %vreg40
+      //   928B     %vreg131:sub7<def> = COPY %vreg42
+      //   960B     %vreg131:sub8<def> = COPY %vreg44
+      //   976B     ... = use %vreg131
+      //   1008B    %vreg140:sub1<def> = COPY %vreg54
+      //   1040B    %vreg140:sub2<def> = COPY %vreg23
+      //   1072B    %vreg140:sub3<def> = COPY %vreg24
+      //   1104B    %vreg140:sub4<def> = COPY %vreg25
+      //   1136B    %vreg140:sub5<def> = COPY %vreg26
+      //   1168B    %vreg140:sub6<def> = COPY %vreg50
+      //   1200B    %vreg140:sub7<def> = COPY %vreg51
+      //   1232B    %vreg140:sub8<def> = COPY %vreg52
+      //   1248B    ... = use %vreg140
+      // The conflict resolution for the main live ranges of both registers
+      // can determine that the coalescing is legal and may proceed, but the
+      // subregister ranges will still contain conflicts, since the subranges
+      // will overlap between the two virtual registers. Since repeating
+      // the resolution code below for subregisters could only result in
+      // CR_Replace, and the legality has already been determined, assume
+      // the resolution to be CR_Replace without repeating the work.
+      V.Resolution = CR_Replace;
+      continue;
+    }
 
     ++NumLaneConflicts;
     assert(V.OtherVNI && "Inconsistent conflict resolution.");
@@ -2700,6 +2733,7 @@
     dbgs() << ":  " << LRange << '\n';
   });
   LIS->extendToIndices(LRange, EndPoints);
+  ShrinkMask |= LaneMask;
 }
 
 void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
Index: test/CodeGen/AMDGPU/tex-miplevel-selection-coalescer.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/tex-miplevel-selection-coalescer.ll
@@ -0,0 +1,104 @@
+; RUN: llc -march=amdgcn -enable-misched=0 < %s | FileCheck %s
+; Test that the register coalescer does not crash on this code. The scheduler
+; also crashed, so disable it to isolate the coalescing problem.
+
+; Check for a sane output instead of a crash.
+; CHECK: image_sample_c_d_o
+
+target triple = "amdgcn--"
+
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
+main_body:
+  %23 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
+  %24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !invariant.load !0
+  %25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
+  %26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
+  %27 = call float @llvm.SI.load.const(<16 x i8> %24, i32 48)
+  %28 = call float @llvm.SI.load.const(<16 x i8> %24, i32 52)
+  %29 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
+  %30 = load <8 x i32>, <8 x i32> addrspace(2)* %29, align 32, !invariant.load !0
+  %31 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
+  %32 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %31, i64 0, i64 3, !amdgpu.uniform !0
+  %33 = load <4 x i32>, <4 x i32> addrspace(2)* %32, align 16, !invariant.load !0
+  %34 = extractelement <8 x i32> %30, i32 7
+  %35 = extractelement <4 x i32> %33, i32 0
+  %36 = and i32 %35, %34
+  %37 = insertelement <4 x i32> %33, i32 %36, i32 0
+  %38 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 2, !amdgpu.uniform !0
+  %39 = load <8 x i32>, <8 x i32> addrspace(2)* %38, align 32, !invariant.load !0
+  %40 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
+  %41 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %40, i64 0, i64 7, !amdgpu.uniform !0
+  %42 = load <4 x i32>, <4 x i32> addrspace(2)* %41, align 16, !invariant.load !0
+  %43 = extractelement <8 x i32> %39, i32 7
+  %44 = extractelement <4 x i32> %42, i32 0
+  %45 = and i32 %44, %43
+  %46 = insertelement <4 x i32> %42, i32 %45, i32 0
+  %47 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %8)
+  %48 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %8)
+  %49 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %6, <2 x i32> %8)
+  %50 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %6, <2 x i32> %8)
+  %51 = fadd float %50, 0xBFA99999A0000000
+  %52 = fadd float %47, 0.000000e+00
+  %53 = fadd float %48, 0.000000e+00
+  %54 = fadd float %49, 0.000000e+00
+  %55 = fadd float %50, 0x3FA99999A0000000
+  %56 = bitcast float %51 to i32
+  %57 = bitcast float %25 to i32
+  %58 = bitcast float %26 to i32
+  %59 = bitcast float %27 to i32
+  %60 = bitcast float %28 to i32
+  %61 = bitcast float %47 to i32
+  %62 = bitcast float %48 to i32
+  %63 = bitcast float %49 to i32
+  %64 = insertelement <16 x i32> <i32 212739, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 %56, i32 1
+  %65 = insertelement <16 x i32> %64, i32 %57, i32 2
+  %66 = insertelement <16 x i32> %65, i32 %58, i32 3
+  %67 = insertelement <16 x i32> %66, i32 %59, i32 4
+  %68 = insertelement <16 x i32> %67, i32 %60, i32 5
+  %69 = insertelement <16 x i32> %68, i32 %61, i32 6
+  %70 = insertelement <16 x i32> %69, i32 %62, i32 7
+  %71 = insertelement <16 x i32> %70, i32 %63, i32 8
+  %72 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %71, <8 x i32> %30, <4 x i32> %37, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+  %73 = extractelement <4 x float> %72, i32 0
+  %74 = bitcast float %55 to i32
+  %75 = bitcast float %25 to i32
+  %76 = bitcast float %26 to i32
+  %77 = bitcast float %27 to i32
+  %78 = bitcast float %28 to i32
+  %79 = bitcast float %52 to i32
+  %80 = bitcast float %53 to i32
+  %81 = bitcast float %54 to i32
+  %82 = insertelement <16 x i32> <i32 212739, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 %74, i32 1
+  %83 = insertelement <16 x i32> %82, i32 %75, i32 2
+  %84 = insertelement <16 x i32> %83, i32 %76, i32 3
+  %85 = insertelement <16 x i32> %84, i32 %77, i32 4
+  %86 = insertelement <16 x i32> %85, i32 %78, i32 5
+  %87 = insertelement <16 x i32> %86, i32 %79, i32 6
+  %88 = insertelement <16 x i32> %87, i32 %80, i32 7
+  %89 = insertelement <16 x i32> %88, i32 %81, i32 8
+  %90 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %89, <8 x i32> %39, <4 x i32> %46, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+  %91 = extractelement <4 x float> %90, i32 0
+  %92 = fmul float %73, %91
+  %93 = bitcast float %5 to i32
+  %94 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %93, 10
+  %95 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %94, float %92, 11
+  %96 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %95, float %92, 12
+  %97 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %96, float %92, 13
+  %98 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %97, float %92, 14
+  %99 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %98, float %21, 24
+  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %99
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { "InitialPSInputAddr"="36983" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{}