Index: lib/CodeGen/RegisterCoalescer.cpp
===================================================================
--- lib/CodeGen/RegisterCoalescer.cpp
+++ lib/CodeGen/RegisterCoalescer.cpp
@@ -1220,6 +1220,38 @@
         SR->createDeadDef(DefIndex, Alloc);
       }
     }
+
+    // Make sure that the subrange for resultant undef is removed
+    // For example:
+    //   vreg1:sub1 = LOAD CONSTANT 1
+    //   vreg2 = COPY vreg1
+    //   ; vreg2:sub0 is actually undef but subrange exists in LiveRange for lane
+    // ==>
+    //   vreg2:sub1 = LOAD CONSTANT 1
+    //   ; Correct but need to remove the subrange for sub0
+    if (NewIdx != 0 && DstIdx == 0 && DstInt.hasSubRanges()) {
+      // The affected subregister segments can be removed.
+      SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI);
+      LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(NewIdx);
+      bool UpdatedSubRanges = false;
+      for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+        if ((SR.LaneMask & DstMask).none()) {
+          DEBUG(dbgs() << "SubRange containing an undef tagged as def "
+                << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n");
+          // VNI is in ValNo - remove any segments in this SubRange that have
+          // this ValNo. The query may find no live-out or dead value at
+          // CurrIdx, in which case valueOutOrDead() returns null and calling
+          // removeValNo() would dereference a null pointer - guard it.
+          if (VNInfo *RmValNo = SR.Query(CurrIdx).valueOutOrDead()) {
+            SR.removeValNo(RmValNo);
+            UpdatedSubRanges = true;
+          }
+        }
+      }
+      // removeValNo can leave a subrange with no segments at all; prune such
+      // empty subranges so the interval stays consistent for later queries.
+      if (UpdatedSubRanges)
+        DstInt.removeEmptySubRanges();
+    }
   } else if (NewMI.getOperand(0).getReg() != CopyDstReg) {
     // The New instruction may be defining a sub-register of what's actually
     // been asked for. If so it must implicitly define the whole thing.
Index: test/CodeGen/AMDGPU/regcoal-subrange-join.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/regcoal-subrange-join.ll
@@ -0,0 +1,83 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+
+; See bug http://llvm.org/PR33524 for details of the problem being checked here
+; This test will provoke a subrange join (see annotations below) during simple register coalescing
+; Without a fix for PR33524 this causes an unreachable in SubRange Join
+; The test looks longer than might be necessary, but cutting it down further stops the problem from appearing
+; A mir test might also be preferable, but PseudoSourceValues for the llvm.amdgcn.buffer.load intrinsics make
+; this tricky
+
+; GCN-LABEL: @regcoal-subrange-join
+; GCN-DAG: s_mov_b32 s[[SUB0:[0-9]+]], {{s[0-9]+}}
+; GCN-DAG: s_mov_b32 s[[SUB1:[0-9]+]], 1
+; GCN-DAG: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SUB0]]:[[SUB1]]{{\]}}, 0x0
+
+; NOTE(review): the <2 x i32> and <4 x float> immediate operands below were lost in transit and
+; reconstructed; <i32 undef, i32 1> matches the s_mov_b32 ..., 1 pattern checked above - verify against the original
+
+define amdgpu_vs void @regcoal-subrange-join(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 %arg6) local_unnamed_addr #0 {
+.entry:
+  %.4.vec.insert9 = insertelement <2 x i32> <i32 undef, i32 1>, i32 %arg2, i32 0
+  %tmp = bitcast <2 x i32> %.4.vec.insert9 to i64
+  %tmp7 = inttoptr i64 %tmp to [4294967295 x i8] addrspace(2)*
+  ; SubRange join fails after a certain amount of coalescing has taken place for the
+  ; construction of %.4.vec.insert after a remat of one of the src operands
+  %.4.vec.insert = insertelement <2 x i32> <i32 undef, i32 1>, i32 %arg5, i32 0
+  %tmp8 = bitcast <2 x i32> %.4.vec.insert to i64
+  %tmp9 = inttoptr i64 %tmp8 to [16 x <4 x i32>] addrspace(2)*
+  %tmp10 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %tmp9, i64 0, i64 0, !amdgpu.uniform !1
+  %tmp11 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp10, align 16, !invariant.load !1
+  %tmp12 = insertelement <4 x i32> %tmp11, i32 491436, i32 3
+  %tmp13 = tail call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp12, i32 undef, i32 0, i1 false, i1 false) #1
+  %tmp14 = inttoptr i64 %tmp to <4 x i32> addrspace(2)*
+  %tmp15 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp14, align 16
+  %tmp16 = tail call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp15, i32 0, i32 0, i1 false, i1 false) #0
+  %tmp17 = bitcast float %tmp16 to i32
+  br i1 undef, label %.lr.ph6.preheader, label %.preheader
+
+.lr.ph6.preheader:                                ; preds = %.entry
+  %tmp18 = fadd <4 x float> %tmp13, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  %tmp19 = icmp slt i32 1, %tmp17
+  br label %.preheader
+
+.preheader:                                       ; preds = %.lr.ph6.preheader, %.entry
+  %f.0.lcssa = phi <4 x float> [ %tmp13, %.entry ], [ %tmp18, %.lr.ph6.preheader ]
+  %.lcssa = phi i32 [ 1, %.entry ], [ 0, %.lr.ph6.preheader ]
+  %tmp20 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp7, i64 0, i64 16
+  %tmp21 = bitcast i8 addrspace(2)* %tmp20 to <4 x i32> addrspace(2)*, !amdgpu.uniform !1
+  %tmp22 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp21, align 16
+  %tmp23 = tail call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp22, i32 0, i32 0, i1 false, i1 false) #0
+  %tmp24 = bitcast float %tmp23 to i32
+  br label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph, %.preheader
+  %k.14 = phi i32 [ %tmp25, %.lr.ph ], [ %.lcssa, %.preheader ]
+  %f.13 = phi <4 x float> [ %tmp26, %.lr.ph ], [ %f.0.lcssa, %.preheader ]
+  %tmp25 = add nsw i32 %k.14, 1
+  %tmp26 = fadd <4 x float> %f.13, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  %tmp27 = icmp slt i32 %tmp25, %tmp24
+  br i1 %tmp27, label %.lr.ph, label %._crit_edge.loopexit
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  %.lcssa30 = phi <4 x float> [ %tmp26, %.lr.ph ]
+  %tmp28 = extractelement <4 x float> %.lcssa30, i32 2
+  tail call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float undef, float undef, float %tmp28, float undef, i1 false, i1 false) #0
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+
+!spirv.Generator = !{!0}
+
+!0 = !{i16 8, i16 1}
+!1 = !{}