Index: lib/CodeGen/LiveIntervalAnalysis.cpp =================================================================== --- lib/CodeGen/LiveIntervalAnalysis.cpp +++ lib/CodeGen/LiveIntervalAnalysis.cpp @@ -506,7 +506,7 @@ // Visit all instructions reading Reg. SlotIndex LastIdx; - for (MachineOperand &MO : MRI->reg_operands(Reg)) { + for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) { MachineInstr *UseMI = MO.getParent(); if (UseMI->isDebugValue() || !MO.readsReg()) continue; @@ -514,8 +514,6 @@ unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubReg); - if (MO.isDef()) - LaneMask = ~LaneMask & MRI->getMaxLaneMaskForVReg(Reg); if ((LaneMask & SR.LaneMask) == 0) continue; } Index: lib/CodeGen/LiveRangeCalc.h =================================================================== --- lib/CodeGen/LiveRangeCalc.h +++ lib/CodeGen/LiveRangeCalc.h @@ -160,6 +160,7 @@ /// all uses must be jointly dominated by the definitions from @p LR /// together with definitions of other lanes where @p LR becomes undefined /// (via <def,read-undef> operands). + /// If @p LR is a main range, the @p LaneMask should be set to ~0. void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask, LiveInterval *LI = nullptr); Index: lib/CodeGen/LiveRangeCalc.cpp =================================================================== --- lib/CodeGen/LiveRangeCalc.cpp +++ lib/CodeGen/LiveRangeCalc.cpp @@ -163,13 +163,14 @@ LI->computeSubRangeUndefs(Undefs, Mask, *MRI, *Indexes); // Visit all operands that read Reg. This may include partial defs. + bool IsSubRange = (Mask != ~0U); const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo(); for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) { // Clear all kill flags. They will be reinserted after register allocation // by LiveIntervalAnalysis::addKillFlags(). 
if (MO.isUse()) MO.setIsKill(false); - if (!MO.readsReg()) + if (!MO.readsReg() || (IsSubRange && MO.isDef())) continue; unsigned SubReg = MO.getSubReg(); Index: lib/CodeGen/SplitKit.cpp =================================================================== --- lib/CodeGen/SplitKit.cpp +++ lib/CodeGen/SplitKit.cpp @@ -1206,7 +1206,8 @@ // defining the register. This is because a <def,read-undef> operand // will create an "undef" point, and we cannot extend any subranges // until all of them have been accounted for. - ExtPoints.push_back(ExtPoint(MO, RegIdx, Next)); + if (MO.isUse()) + ExtPoints.push_back(ExtPoint(MO, RegIdx, Next)); } else { LiveRangeCalc &LRC = getLRCalc(RegIdx); LRC.extend(LI, Next, 0, ArrayRef()); @@ -1221,10 +1222,6 @@ unsigned Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg(); LaneBitmask LM = Sub != 0 ? TRI.getSubRegIndexLaneMask(Sub) : MRI.getMaxLaneMaskForVReg(Reg); - // If this is a non-read-undef definition of a sub-register, extend - // subranges for everything except that sub-register. - if (Sub != 0 && EP.MO.isDef()) - LM = MRI.getMaxLaneMaskForVReg(Reg) & ~LM; for (LiveInterval::SubRange &S : LI.subranges()) { if (!(S.LaneMask & LM)) continue; Index: test/CodeGen/AMDGPU/coalescer-subreg-join.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/coalescer-subreg-join.mir @@ -0,0 +1,75 @@ +# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s +# Check that %11 and %20 have been coalesced. 
+# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG:[0-9]+]] +# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG]] + +--- +name: main +alignment: 0 +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_256 } + - { id: 4, class: sreg_128 } + - { id: 5, class: sreg_256 } + - { id: 6, class: sreg_128 } + - { id: 7, class: sreg_512 } + - { id: 9, class: vreg_512 } + - { id: 11, class: vreg_512 } + - { id: 18, class: vgpr_32 } + - { id: 20, class: vreg_512 } + - { id: 27, class: vgpr_32 } +liveins: + - { reg: '%sgpr2_sgpr3', virtual-reg: '%0' } + - { reg: '%vgpr2', virtual-reg: '%1' } + - { reg: '%vgpr3', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0: + liveins: %sgpr2_sgpr3, %vgpr2, %vgpr3 + + %0 = COPY %sgpr2_sgpr3 + %1 = COPY %vgpr2 + %2 = COPY %vgpr3 + %3 = S_LOAD_DWORDX8_IMM %0, 0 + %4 = S_LOAD_DWORDX4_IMM %0, 12 + %5 = S_LOAD_DWORDX8_IMM %0, 16 + %6 = S_LOAD_DWORDX4_IMM %0, 28 + undef %7.sub0 = S_MOV_B32 212739 + %20 = COPY %7 + %11 = COPY %20 + %11.sub1 = COPY %1 + %11.sub2 = COPY %1 + %11.sub3 = COPY %1 + %11.sub4 = COPY %1 + %11.sub5 = COPY %1 + %11.sub6 = COPY %1 + %11.sub7 = COPY %1 + %11.sub8 = COPY %1 + dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec + %20.sub1 = COPY %2 + %20.sub2 = COPY %2 + %20.sub3 = COPY %2 + %20.sub4 = COPY %2 + %20.sub5 = COPY %2 + %20.sub6 = COPY %2 + %20.sub7 = COPY %2 + %20.sub8 = COPY %2 + dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec + +... 
Index: test/CodeGen/AMDGPU/scheduler-liveness-1.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/scheduler-liveness-1.ll @@ -0,0 +1,85 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; Check for a sane output. This testcase used to crash. +; CHECK: image_sample_c + +target triple = "amdgcn--" + +define amdgpu_ps void @main() #0 { +main_body: + %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) + %1 = load <4 x i32>, <4 x i32> addrspace(2)* null, align 16, !invariant.load !0 + %2 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> zeroinitializer, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %3 = extractelement <4 x float> %2, i32 0 + %4 = fmul float %3, 2.000000e+00 + %5 = fadd float %4, -1.000000e+00 + %6 = fmul float %5, undef + %7 = fadd float %6, undef + %8 = call float @llvm.AMDGPU.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) + %9 = fmul float %8, 0.000000e+00 + %10 = fadd float undef, %9 + %11 = fmul float %0, undef + %12 = fadd float %11, undef + %13 = fadd float undef, undef + %14 = bitcast float %12 to i32 + %15 = bitcast float %13 to i32 + %16 = insertelement <4 x i32> undef, i32 %14, i32 0 + %17 = insertelement <4 x i32> %16, i32 %15, i32 1 + %18 = insertelement <4 x i32> %17, i32 undef, i32 2 + %19 = bitcast float %12 to i32 + %20 = insertelement <4 x i32> undef, i32 %19, i32 0 + %21 = insertelement <4 x i32> %20, i32 0, i32 1 + %22 = insertelement <4 x i32> %21, i32 0, i32 2 + %23 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %22, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %24 = extractelement <4 x float> %23, i32 0 + %25 = fadd float undef, %24 + %26 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> zeroinitializer, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + 
%27 = extractelement <4 x float> %26, i32 0 + %28 = fadd float %25, %27 + %29 = fadd float %28, undef + %30 = fadd float undef, undef + %31 = bitcast float %12 to i32 + %32 = bitcast float %30 to i32 + %33 = insertelement <4 x i32> undef, i32 %31, i32 0 + %34 = insertelement <4 x i32> %33, i32 %32, i32 1 + %35 = insertelement <4 x i32> %34, i32 undef, i32 2 + %36 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %35, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %37 = extractelement <4 x float> %36, i32 0 + %38 = fadd float %29, %37 + %39 = fmul float %38, 1.250000e-01 + %40 = fmul float %39, 8.000000e+00 + br label %LOOP + +LOOP: ; preds = %LOOP, %main_body + %.038 = phi float [ 0x36C0000000000000, %main_body ], [ 0.000000e+00, %LOOP ] + %.5 = phi float [ %40, %main_body ], [ undef, %LOOP ] + %41 = bitcast float %.038 to i32 + %42 = icmp sgt i32 %41, 15 + br i1 %42, label %IF29, label %LOOP + +IF29: ; preds = %LOOP + %43 = fmul float %.5, 3.125000e-02 + %44 = fadd float %43, undef + %45 = call float @llvm.AMDGPU.clamp.(float %44, float 0.000000e+00, float 1.000000e+00) + %46 = fmul float %10, %45 + %47 = fmul float %46, undef + %48 = fadd float %47, undef + %49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %48, 11 + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.AMDGPU.clamp.(float, float, float) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +attributes #0 = { 
"InitialPSInputAddr"="36983" "target-cpu"="tonga" } +attributes #1 = { nounwind readnone } + +!0 = !{} Index: test/CodeGen/AMDGPU/scheduler-liveness-2.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/scheduler-liveness-2.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; Check for a sane output. This testcase used to crash. +; CHECK: tbuffer_store_format_x + +target triple = "amdgcn--" + +define amdgpu_gs void @main(i32 inreg) { +main_body: + %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 36) + %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 40) + %array_vector21 = insertelement <4 x float> , float undef, i32 1 + %array_vector22 = insertelement <4 x float> %array_vector21, float %1, i32 2 + %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 + %array_vector25 = insertelement <4 x float> , float undef, i32 1 + %array_vector26 = insertelement <4 x float> %array_vector25, float %2, i32 2 + %array_vector27 = insertelement <4 x float> %array_vector26, float 0.000000e+00, i32 3 + %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> + %3 = extractelement <4 x i32> %bc52, i32 undef + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %3, i32 1, i32 64, i32 %0, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %bc53 = bitcast <4 x float> %array_vector27 to <4 x i32> + %4 = extractelement <4 x i32> %bc53, i32 undef + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %4, i32 1, i32 76, i32 %0, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: 
test/CodeGen/AMDGPU/unigine-liveness-crash.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/unigine-liveness-crash.ll @@ -0,0 +1,140 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s +; Check for a sane output. This testcase used to crash. +; CHECK: image_sample_c + +target triple = "amdgcn--" + +@ddxy_lds = external addrspace(3) global [64 x i32] + +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 { +main_body: + %23 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %6, <2 x i32> %8) + %24 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %25 = extractelement <4 x float> %24, i32 3 + %26 = fmul float %25, undef + %27 = fmul float undef, %23 + %28 = fadd float %27, undef + %29 = bitcast float %28 to i32 + %30 = insertelement <4 x i32> undef, i32 %29, i32 0 + %31 = insertelement <4 x i32> %30, i32 0, i32 1 + %32 = insertelement <4 x i32> %31, i32 undef, i32 2 + %33 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %34 = extractelement <4 x float> %33, i32 0 + %35 = fadd float undef, %34 + %36 = fadd float %35, undef + %37 = fadd float 
%36, undef + %38 = fadd float %37, undef + %39 = fadd float %38, undef + %40 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %41 = extractelement <4 x float> %40, i32 0 + %42 = extractelement <4 x float> %40, i32 1 + %43 = extractelement <4 x float> %40, i32 2 + %44 = extractelement <4 x float> %40, i32 3 + %45 = fmul float %41, undef + %46 = fmul float %42, undef + %47 = fmul float %43, undef + %48 = fmul float %44, undef + %49 = fadd float undef, %45 + %50 = fadd float undef, %46 + %51 = bitcast float %28 to i32 + %52 = bitcast float %49 to i32 + %53 = bitcast float %50 to i32 + %54 = insertelement <4 x i32> undef, i32 %51, i32 0 + %55 = insertelement <4 x i32> %54, i32 %52, i32 1 + %56 = insertelement <4 x i32> %55, i32 %53, i32 2 + %57 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %56, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %58 = extractelement <4 x float> %57, i32 0 + %59 = fadd float %39, %58 + %60 = fadd float undef, %47 + %61 = fadd float undef, %48 + %62 = bitcast float %60 to i32 + %63 = bitcast float %61 to i32 + %64 = insertelement <4 x i32> undef, i32 %62, i32 1 + %65 = insertelement <4 x i32> %64, i32 %63, i32 2 + %66 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %65, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %67 = extractelement <4 x float> %66, i32 0 + %68 = fadd float %59, %67 + %69 = fmul float %68, 1.250000e-01 + %70 = fmul float %69, undef + %71 = fcmp une float %70, 0.000000e+00 + br i1 %71, label %IF26, label %ENDIF25 + +IF26: ; preds = %main_body + %72 = bitcast float %28 to i32 + %73 = insertelement <4 x i32> undef, i32 %72, i32 0 + br label %LOOP + +ENDIF25: ; preds = %IF29, %main_body + %.4 = phi float [ %85, %IF29 ], [ %69, %main_body ] + %74 = fadd float %.4, undef + %75 = call float 
@llvm.AMDGPU.clamp.(float %74, float 0.000000e+00, float 1.000000e+00) + %76 = fmul float undef, %75 + %77 = fmul float %76, undef + %78 = fadd float %77, undef + %79 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %78, 11 + %80 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %79, float undef, 12 + %81 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %80, float undef, 13 + %82 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %81, float %26, 14 + %83 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %82, float undef, 15 + %84 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %83, float %21, 24 + ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %84 + +LOOP: ; preds = %ENDIF28, %IF26 + %.5 = phi float [ undef, %IF26 ], [ %90, %ENDIF28 ] + br i1 false, label %IF29, label %ENDIF28 + +IF29: ; preds = %LOOP + %85 = fmul float %.5, 3.125000e-02 + br label %ENDIF25 + +ENDIF28: ; preds = %LOOP + %86 = insertelement <4 x i32> %73, i32 undef, i32 1 + %87 = insertelement <4 x i32> %86, i32 undef, i32 2 + %88 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %87, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 
0, i32 0, i32 0, i32 0, i32 0) + %89 = extractelement <4 x float> %88, i32 0 + %90 = fadd float undef, %89 + br label %LOOP +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 + +; Function Attrs: convergent nounwind readnone +declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.AMDGPU.clamp.(float, float, float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.sqrt.f32(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.pow.f32(float, float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.fabs.f32(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" } +attributes #1 = { nounwind readnone } +attributes #2 = { convergent nounwind readnone }