Index: lib/CodeGen/RegisterCoalescer.cpp
===================================================================
--- lib/CodeGen/RegisterCoalescer.cpp
+++ lib/CodeGen/RegisterCoalescer.cpp
@@ -1220,6 +1220,39 @@
         SR->createDeadDef(DefIndex, Alloc);
       }
     }
+
+    // Make sure that values recorded for lanes left undef are removed.
+    // NewMI defines only the lanes covered by NewIdx, so any value that a
+    // subrange for other lanes still has at NewMI is stale.
+    // For example:
+    //   vreg1:sub1 = LOAD CONSTANT 1
+    //   vreg2 = COPY vreg1
+    // ==>
+    //   vreg2:sub1 = LOAD CONSTANT 1
+    //     ; Correct but need to remove the subrange for vreg2:sub0
+    //     ; as it is now undef
+    if (NewIdx != 0 && DstInt.hasSubRanges()) {
+      // Register slot of NewMI; loop-invariant, so computed once up front.
+      SlotIndex DefSlot = LIS->getInstructionIndex(NewMI).getRegSlot();
+      LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(NewIdx);
+      bool UpdatedSubRanges = false;
+      for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+        // Subranges sharing a lane with the new definition are left alone.
+        if (!(SR.LaneMask & DstMask).none())
+          continue;
+        // Act (and log) only when a value is found at NewMI; removeValNo
+        // then drops every segment of this subrange that has that value.
+        if (VNInfo *RmValNo = SR.getVNInfoAt(DefSlot)) {
+          DEBUG(dbgs() << "Removing undefined SubRange "
+                       << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n");
+          SR.removeValNo(RmValNo);
+          UpdatedSubRanges = true;
+        }
+      }
+      // Drop subranges that the removals above left without any segment.
+      if (UpdatedSubRanges)
+        DstInt.removeEmptySubRanges();
+    }
   } else if (NewMI.getOperand(0).getReg() != CopyDstReg) {
     // The New instruction may be defining a sub-register of what's actually
     // been asked for. If so it must implicitly define the whole thing.
Index: test/CodeGen/AMDGPU/regcoal-subrange-join.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/regcoal-subrange-join.ll
@@ -0,0 +1,75 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+
+; See bug http://llvm.org/PR33524 for details of the problem being checked here.
+; This test provokes a subrange join (see the annotations below) during simple register coalescing.
+; Without a fix for PR33524 this hits an unreachable in the SubRange join.
+; The test looks longer than might be necessary, but cutting it down further stops the problem from appearing.
+; A MIR test might also be preferable, but PseudoSourceValues for the llvm.amdgcn.buffer.load intrinsics make
+; this tricky.
+
+; GCN-LABEL: @regcoal-subrange-join
+; GCN-DAG: s_mov_b32 s[[SUB0:[0-9]+]], {{s[0-9]+}}
+; GCN-DAG: s_mov_b32 s[[SUB1:[0-9]+]], 1
+; GCN-DAG: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SUB0]]:[[SUB1]]{{\]}}, 0x0
+
+define amdgpu_vs void @regcoal-subrange-join(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 %arg6) local_unnamed_addr #0 {
+.entry:
+  %.4.vec.insert9 = insertelement <2 x i32> , i32 %arg2, i32 0 ; NOTE(review): the constant vector operand appears to be missing before the comma; confirm against the upstream test
+  %tmp = bitcast <2 x i32> %.4.vec.insert9 to i64
+  %tmp7 = inttoptr i64 %tmp to [4294967295 x i8] addrspace(2)*
+  ; The SubRange join fails after a certain amount of coalescing has taken place for the
+  ; construction of %.4.vec.insert, after a remat of one of the src operands.
+  %.4.vec.insert = insertelement <2 x i32> , i32 %arg5, i32 0 ; NOTE(review): the constant vector operand appears to be missing before the comma; confirm against the upstream test
+  %tmp8 = bitcast <2 x i32> %.4.vec.insert to i64
+  %tmp9 = inttoptr i64 %tmp8 to [16 x <4 x i32>] addrspace(2)*
+  %tmp10 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %tmp9, i64 0, i64 0
+  %tmp11 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp10, align 16
+  %tmp12 = insertelement <4 x i32> %tmp11, i32 491436, i32 3
+  %tmp13 = tail call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp12, i32 undef, i32 0, i1 false, i1 false) #1
+  %tmp14 = inttoptr i64 %tmp to <4 x i32> addrspace(2)*
+  %tmp15 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp14, align 16
+  %tmp16 = tail call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp15, i32 0, i32 0, i1 false, i1 false) #0
+  %tmp17 = bitcast float %tmp16 to i32
+  br i1 undef, label %.lr.ph6.preheader, label %.preheader
+
+.lr.ph6.preheader: ; preds = %.entry
+  %tmp18 = fadd <4 x float> %tmp13, ; NOTE(review): the second fadd operand appears to be missing; confirm against the upstream test
+  %tmp19 = icmp slt i32 1, %tmp17
+  br label %.preheader
+
+.preheader: ; preds = %.lr.ph6.preheader, %.entry
+  %f.0.lcssa = phi <4 x float> [ %tmp13, %.entry ], [ %tmp18, %.lr.ph6.preheader ]
+  %.lcssa = phi i32 [ 1, %.entry ], [ 0, %.lr.ph6.preheader ]
+  %tmp20 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp7, i64 0, i64 16
+  %tmp21 = bitcast i8 addrspace(2)* %tmp20 to <4 x i32> addrspace(2)*
+  %tmp22 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp21, align 16
+  %tmp23 = tail call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp22, i32 0, i32 0, i1 false, i1 false) #0
+  %tmp24 = bitcast float %tmp23 to i32
+  br label %.lr.ph
+
+.lr.ph: ; preds = %.lr.ph, %.preheader
+  %k.14 = phi i32 [ %tmp25, %.lr.ph ], [ %.lcssa, %.preheader ]
+  %f.13 = phi <4 x float> [ %tmp26, %.lr.ph ], [ %f.0.lcssa, %.preheader ]
+  %tmp25 = add nsw i32 %k.14, 1
+  %tmp26 = fadd <4 x float> %f.13, ; NOTE(review): the second fadd operand appears to be missing; confirm against the upstream test
+  %tmp27 = icmp slt i32 %tmp25, %tmp24
+  br i1 %tmp27, label %.lr.ph, label %._crit_edge.loopexit
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+  %.lcssa30 = phi <4 x float> [ %tmp26, %.lr.ph ]
+  %tmp28 = extractelement <4 x float> %.lcssa30, i32 2
+  tail call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float undef, float undef, float %tmp28, float undef, i1 false, i1 false) #0
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }