Index: lib/CodeGen/LiveIntervalAnalysis.cpp =================================================================== --- lib/CodeGen/LiveIntervalAnalysis.cpp +++ lib/CodeGen/LiveIntervalAnalysis.cpp @@ -948,8 +948,12 @@ LiveInterval &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { unsigned SubReg = MO.getSubReg(); + LaneBitmask AllSubMask = MRI.getMaxLaneMaskForVReg(Reg); LaneBitmask LaneMask = SubReg ? TRI.getSubRegIndexLaneMask(SubReg) - : MRI.getMaxLaneMaskForVReg(Reg); + : AllSubMask; + // A non-undef subreg def reads other lanes. + if (SubReg != 0 && MO.isDef() && !MO.isUndef()) + LaneMask = AllSubMask; for (LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & LaneMask) == 0) continue; @@ -988,7 +992,7 @@ dbgs() << ":\t" << LR << '\n'; }); if (SlotIndex::isEarlierInstr(OldIdx, NewIdx)) - handleMoveDown(LR); + handleMoveDown(LR, Reg, LaneMask); else handleMoveUp(LR, Reg, LaneMask); DEBUG(dbgs() << " -->\t" << LR << '\n'); @@ -997,7 +1001,7 @@ /// Update LR to reflect an instruction has been moved downwards from OldIdx /// to NewIdx (OldIdx < NewIdx). - void handleMoveDown(LiveRange &LR) { + void handleMoveDown(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) { LiveRange::iterator E = LR.end(); // Segment going into OldIdx. LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex()); @@ -1034,8 +1038,28 @@ LiveRange::iterator Prev = std::prev(NewIdxIn); Prev->end = NewIdx.getRegSlot(); } - // Extend OldIdxIn. - OldIdxIn->end = Next->start; + // Terminate OldIdxIn. The new end of that segment is not necessarily + // directly adjacent to the start of the new one. Take this case for + // example: + // 1008B %vreg131:sub1 = COPY %vreg54 + // 1040B %vreg131:sub2 = COPY %vreg23 + // 1072B %vreg131:sub3 = COPY %vreg24 + // 1104B %vreg131:sub4 = COPY %vreg25 + // 1200B ... = COPY vreg131 + // The live subrange for vreg131:sub2 will contain + // [...,1008r)[1040r,1200r) + // + // Consider a move of 1008 to after 1072. 
This will move the def of + // sub1, but the range for other subregisters (including sub2) will + // need to be updated as well. The OldIdxIn for sub2 is the one ending + // at 1008r, Next is the one starting at 1040r. OldIdxIn cannot be + // extended to Next, since 1040r does not actually read sub2. Instead, + // we need to find the last use between the previous def of sub2 and + // the beginning of the next segment. + // + OldIdxIn->end = findLastUseBefore(OldIdxIn->start, + Next->start.getDeadSlot(), + Reg, LaneMask); return; } @@ -1194,7 +1218,8 @@ SlotIndex DefBeforeOldIdx = std::max(OldIdxIn->start.getDeadSlot(), NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber())); - OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, Reg, LaneMask); + OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, OldIdx, + Reg, LaneMask); // Did we have a Def at OldIdx? If not we are done now. OldIdxOut = std::next(OldIdxIn); @@ -1305,38 +1330,41 @@ "Cannot move regmask instruction below another call"); } - // Return the last use of reg between NewIdx and OldIdx. - SlotIndex findLastUseBefore(SlotIndex Before, unsigned Reg, - LaneBitmask LaneMask) { + // Return the last use of reg between Min and Max. 
+ SlotIndex findLastUseBefore(SlotIndex Min, SlotIndex Max, + unsigned Reg, LaneBitmask LaneMask) { if (TargetRegisterInfo::isVirtualRegister(Reg)) { - SlotIndex LastUse = Before; - for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { - if (MO.isUndef()) + SlotIndex LastUse = Min; + for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { + if (!MO.readsReg()) continue; unsigned SubReg = MO.getSubReg(); - if (SubReg != 0 && LaneMask != 0 - && (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask) == 0) - continue; + if (SubReg != 0 && LaneMask != 0) { + LaneBitmask SLM = TRI.getSubRegIndexLaneMask(SubReg); + if (MO.isDef()) + SLM = ~SLM & MRI.getMaxLaneMaskForVReg(Reg); + if ((LaneMask & SLM) == 0) + continue; + } const MachineInstr &MI = *MO.getParent(); SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI); - if (InstSlot > LastUse && InstSlot < OldIdx) + if (InstSlot > LastUse && InstSlot < Max) LastUse = InstSlot.getRegSlot(); } return LastUse; } // This is a regunit interval, so scanning the use list could be very - // expensive. Scan upwards from OldIdx instead. - assert(Before < OldIdx && "Expected upwards move"); + // expensive. Scan upwards from Max instead. SlotIndexes *Indexes = LIS.getSlotIndexes(); - MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Before); + MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Min); - // OldIdx may not correspond to an instruction any longer, so set MII to - // point to the next instruction after OldIdx, or MBB->end(). + // Max may not correspond to an instruction any longer, so set MII to + // point to the next instruction after Max, or MBB->end(). MachineBasicBlock::iterator MII = MBB->end(); if (MachineInstr *MI = Indexes->getInstructionFromIndex( - Indexes->getNextNonNullIndex(OldIdx))) + Indexes->getNextNonNullIndex(Max))) if (MI->getParent() == MBB) MII = MI; @@ -1346,9 +1374,9 @@ continue; SlotIndex Idx = Indexes->getInstructionIndex(*MII); - // Stop searching when Before is reached. 
- if (!SlotIndex::isEarlierInstr(Before, Idx)) - return Before; + // Stop searching when Min is reached. + if (!SlotIndex::isEarlierInstr(Min, Idx)) + return Min; // Check if MII uses Reg. for (MIBundleOperands MO(*MII); MO.isValid(); ++MO) @@ -1357,8 +1385,8 @@ TRI.hasRegUnit(MO->getReg(), Reg)) return Idx.getRegSlot(); } - // Didn't reach Before. It must be the first instruction in the block. - return Before; + // Didn't reach Min. It must be the first instruction in the block. + return Min; } }; Index: test/CodeGen/AMDGPU/scheduler-liveness-1.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/scheduler-liveness-1.ll @@ -0,0 +1,85 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; Check for a sane output. This testcase used to crash. +; CHECK: image_sample_c + +target triple = "amdgcn--" + +define amdgpu_ps void @main() #0 { +main_body: + %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) + %1 = load <4 x i32>, <4 x i32> addrspace(2)* null, align 16, !invariant.load !0 + %2 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> zeroinitializer, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %3 = extractelement <4 x float> %2, i32 0 + %4 = fmul float %3, 2.000000e+00 + %5 = fadd float %4, -1.000000e+00 + %6 = fmul float %5, undef + %7 = fadd float %6, undef + %8 = call float @llvm.AMDGPU.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) + %9 = fmul float %8, 0.000000e+00 + %10 = fadd float undef, %9 + %11 = fmul float %0, undef + %12 = fadd float %11, undef + %13 = fadd float undef, undef + %14 = bitcast float %12 to i32 + %15 = bitcast float %13 to i32 + %16 = insertelement <4 x i32> undef, i32 %14, i32 0 + %17 = insertelement <4 x i32> %16, i32 %15, i32 1 + %18 = insertelement <4 x i32> %17, i32 undef, i32 2 + %19 = bitcast float %12 to i32 + %20 = insertelement <4 x i32> undef, i32 %19, i32 0 + %21 = insertelement <4 
x i32> %20, i32 0, i32 1 + %22 = insertelement <4 x i32> %21, i32 0, i32 2 + %23 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %22, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %24 = extractelement <4 x float> %23, i32 0 + %25 = fadd float undef, %24 + %26 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> zeroinitializer, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %27 = extractelement <4 x float> %26, i32 0 + %28 = fadd float %25, %27 + %29 = fadd float %28, undef + %30 = fadd float undef, undef + %31 = bitcast float %12 to i32 + %32 = bitcast float %30 to i32 + %33 = insertelement <4 x i32> undef, i32 %31, i32 0 + %34 = insertelement <4 x i32> %33, i32 %32, i32 1 + %35 = insertelement <4 x i32> %34, i32 undef, i32 2 + %36 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %35, <8 x i32> zeroinitializer, <4 x i32> %1, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %37 = extractelement <4 x float> %36, i32 0 + %38 = fadd float %29, %37 + %39 = fmul float %38, 1.250000e-01 + %40 = fmul float %39, 8.000000e+00 + br label %LOOP + +LOOP: ; preds = %LOOP, %main_body + %.038 = phi float [ 0x36C0000000000000, %main_body ], [ 0.000000e+00, %LOOP ] + %.5 = phi float [ %40, %main_body ], [ undef, %LOOP ] + %41 = bitcast float %.038 to i32 + %42 = icmp sgt i32 %41, 15 + br i1 %42, label %IF29, label %LOOP + +IF29: ; preds = %LOOP + %43 = fmul float %.5, 3.125000e-02 + %44 = fadd float %43, undef + %45 = call float @llvm.AMDGPU.clamp.(float %44, float 0.000000e+00, float 1.000000e+00) + %46 = fmul float %10, %45 + %47 = fmul float %46, undef + %48 = fadd float %47, undef + %49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %48, 11 + ret void +} + +; Function Attrs: nounwind readnone +declare 
float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.AMDGPU.clamp.(float, float, float) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" } +attributes #1 = { nounwind readnone } + +!0 = !{} Index: test/CodeGen/AMDGPU/scheduler-liveness-2.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/scheduler-liveness-2.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; Check for a sane output. This testcase used to crash. +; CHECK: tbuffer_store_format_x + +target triple = "amdgcn--" + +define amdgpu_gs void @main(i32 inreg) { +main_body: + %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 36) + %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 40) + %array_vector21 = insertelement <4 x float> , float undef, i32 1 + %array_vector22 = insertelement <4 x float> %array_vector21, float %1, i32 2 + %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 + %array_vector25 = insertelement <4 x float> , float undef, i32 1 + %array_vector26 = insertelement <4 x float> %array_vector25, float %2, i32 2 + %array_vector27 = insertelement <4 x float> %array_vector26, float 0.000000e+00, i32 3 + %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> + %3 = extractelement <4 x i32> %bc52, i32 undef + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %3, i32 1, i32 64, i32 %0, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %bc53 = bitcast <4 x float> %array_vector27 to <4 x i32> + %4 = extractelement <4 x i32> %bc53, i32 undef + call void 
@llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %4, i32 1, i32 76, i32 %0, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind }