Index: lib/CodeGen/LiveIntervals.cpp =================================================================== --- lib/CodeGen/LiveIntervals.cpp +++ lib/CodeGen/LiveIntervals.cpp @@ -1296,6 +1296,36 @@ if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdx, OldIdxIn->end)) OldIdxIn->end = NewIdx.getRegSlot(); } + } else if (OldIdxIn != E + && SlotIndex::isEarlierInstr(NewIdxOut->start, NewIdx) + && SlotIndex::isEarlierInstr(NewIdx, NewIdxOut->end)) { + // OldIdxVNI is a dead def that has been moved into the middle of + // another value in LR. That can happen when LR is a whole register, + // but the dead def is a write to a subreg that is dead at NewIdx. + // The dead def may have been moved across other values + // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut) + // down one position. + // |- X0/NewIdxOut -| ... |- Xn-1 -| |- Xn/OldIdxOut -| |- next - | + // => |- X0/NewIdxOut -| |- X0 -| ... |- Xn-1 -| |- next -| + std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut)); + // Modify the segment at NewIdxOut and the following segment to meet at + // the point of the dead def, with the following segment getting + // OldIdxVNI as its value number. + *NewIdxOut = LiveRange::Segment( + NewIdxOut->start, NewIdxDef.getRegSlot(), NewIdxOut->valno); + *(NewIdxOut + 1) = LiveRange::Segment( + NewIdxDef.getRegSlot(), (NewIdxOut + 1)->end, OldIdxVNI); + OldIdxVNI->def = NewIdxDef; + // Modify subsequent segments to be defined by the moved def OldIdxVNI. + for (auto Idx = NewIdxOut + 2; Idx <= OldIdxOut; ++Idx) + Idx->valno = OldIdxVNI; + // Aggressively remove all dead flags from the former dead definition. + // Kill/dead flags shouldn't be used while live intervals exist; they + // will be reinserted by VirtRegRewriter. + if (MachineInstr *KillMI = LIS.getInstructionFromIndex(NewIdx)) + for (MIBundleOperands MO(*KillMI); MO.isValid(); ++MO) + if (MO->isReg() && !MO->isUse()) + MO->setIsDead(false); } else { // OldIdxVNI is a dead def. 
It may have been moved across other values // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut) Index: test/CodeGen/AMDGPU/machine-scheduler-move-dead-def-up.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/machine-scheduler-move-dead-def-up.ll @@ -0,0 +1,274 @@ +; RUN: llc -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; This bugpoint cutdown would assert in machine scheduling when trying to move a +; dead def of a subreg up into the middle of a segment. + +; GCN-LABEL: {{^}}_amdgpu_cs_main: + +source_filename = "../test/CodeGen/AMDGPU/machine-scheduler-move-dead-def-up.ll" +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind readnone speculatable +declare float @llvm.floor.f32(float) #0 + +; Function Attrs: nounwind +define dllexport amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %arg, <3 x i32> %arg1) local_unnamed_addr #1 { +.entry: + %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract.rhs = extractelement <3 x i32> %arg1, i32 0 + %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract = add i32 0, %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract.rhs + br i1 undef, label %bb, label %bb2 + +bb: ; preds = %.entry + br label %bb2 + +bb2: ; preds = %bb, %.entry + br i1 undef, label %bb3, label %"SimulateParticle(i1;f1;.exit" + +bb3: ; preds = %bb2 + br i1 undef, label %bb5, label %bb4 + +bb4: ; preds = %bb3 + br label %bb5 + +bb5: ; preds = %bb4, %bb3 + br i1 undef, label %"SimulateParticle(i1;f1;.exit", label %bb6 + +bb6: ; preds = %bb5 + br i1 undef, label %"SimulateParticle(i1;f1;.exit", label %bb7 + +bb7: ; preds = %bb6 + br i1 undef, label %"SimulateParticle(i1;f1;.exit", label %bb8 + +bb8: ; preds = %bb7 + br i1 undef, label %._crit_edge, label 
%bb9 + +._crit_edge: ; preds = %bb8 + br label %bb32 + +bb9: ; preds = %bb8 + switch i32 undef, label %bb10 [ + i32 0, label %bb11 + i32 1, label %bb12 + i32 2, label %bb13 + i32 3, label %bb18 + i32 4, label %bb19 + i32 5, label %bb20 + ] + +bb10: ; preds = %bb9 + br label %"Distribute(i1;vf4;.exit.i" + +bb11: ; preds = %bb9 + br label %"Distribute(i1;vf4;.exit.i" + +bb12: ; preds = %bb9 + br label %"Distribute(i1;vf4;.exit.i" + +bb13: ; preds = %bb9 + br i1 undef, label %bb14, label %bb15 + +bb14: ; preds = %bb13 + br label %"Distribute(i1;vf4;.exit.i" + +bb15: ; preds = %bb13 + br i1 undef, label %bb16, label %bb17 + +bb16: ; preds = %bb15 + br label %"Distribute(i1;vf4;.exit.i" + +bb17: ; preds = %bb15 + br label %"Distribute(i1;vf4;.exit.i" + +bb18: ; preds = %bb9 + br label %"Distribute(i1;vf4;.exit.i" + +bb19: ; preds = %bb9 + br label %"Distribute(i1;vf4;.exit.i" + +bb20: ; preds = %bb9 + br label %"Distribute(i1;vf4;.exit.i" + +"Distribute(i1;vf4;.exit.i": ; preds = %bb20, %bb19, %bb18, %bb17, %bb16, %bb14, %bb12, %bb11, %bb10 + switch i32 undef, label %bb21 [ + i32 0, label %bb22 + i32 1, label %bb23 + i32 2, label %bb24 + i32 3, label %bb29 + i32 4, label %bb30 + i32 5, label %bb31 + ] + +bb21: ; preds = %"Distribute(i1;vf4;.exit.i" + br label %"EmitParticle(i1;i1;f1;.exit" + +bb22: ; preds = %"Distribute(i1;vf4;.exit.i" + br label %"EmitParticle(i1;i1;f1;.exit" + +bb23: ; preds = %"Distribute(i1;vf4;.exit.i" + br label %"EmitParticle(i1;i1;f1;.exit" + +bb24: ; preds = %"Distribute(i1;vf4;.exit.i" + br i1 undef, label %bb25, label %bb26 + +bb25: ; preds = %bb24 + br label %"EmitParticle(i1;i1;f1;.exit" + +bb26: ; preds = %bb24 + br i1 undef, label %bb27, label %bb28 + +bb27: ; preds = %bb26 + br label %"EmitParticle(i1;i1;f1;.exit" + +bb28: ; preds = %bb26 + br label %"EmitParticle(i1;i1;f1;.exit" + +bb29: ; preds = %"Distribute(i1;vf4;.exit.i" + br label %"EmitParticle(i1;i1;f1;.exit" + +bb30: ; preds = %"Distribute(i1;vf4;.exit.i" + br label 
%"EmitParticle(i1;i1;f1;.exit" + +bb31: ; preds = %"Distribute(i1;vf4;.exit.i" + br label %"EmitParticle(i1;i1;f1;.exit" + +"EmitParticle(i1;i1;f1;.exit": ; preds = %bb31, %bb30, %bb29, %bb28, %bb27, %bb25, %bb23, %bb22, %bb21 + br label %bb32 + +bb32: ; preds = %"EmitParticle(i1;i1;f1;.exit", %._crit_edge + br i1 undef, label %"SimulateParticle(i1;f1;.exit", label %bb33 + +bb33: ; preds = %bb32 + %tmp = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %arg, i32 0, i1 false) #1 + %tmp34 = bitcast <4 x i32> %tmp to <4 x float> + %tmp35 = shl i32 %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract, 4 + %tmp36 = and i32 %tmp35, 131056 + %tmp37 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> undef, i32 %tmp36, i1 false) #1 + %tmp38 = bitcast <4 x i32> %tmp37 to <4 x float> + %tmp39 = fadd reassoc nnan arcp contract <4 x float> %tmp34, %tmp38 + %tmp40 = fmul reassoc nnan arcp contract float undef, 0x3F20000000000000 + %tmp41 = tail call float @llvm.floor.f32(float %tmp40) #1 + %tmp42 = fmul reassoc nnan arcp contract float %tmp41, 0x3FBF972480000000 + %tmp43 = insertelement <4 x float> undef, float %tmp42, i32 0 + %tmp44 = shufflevector <4 x float> %tmp43, <4 x float> undef, <4 x i32> zeroinitializer + %tmp45 = fadd reassoc nnan arcp contract <4 x float> %tmp44, %tmp39 + %x0.i41 = extractelement <4 x float> %tmp45, i32 0 + %tmp46 = call float @llvm.amdgcn.fract.f32(float %x0.i41) #1 + %tmp47 = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1 + %tmp48 = shufflevector <4 x float> %tmp47, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> + br i1 undef, label %bb49, label %bb54 + +bb49: ; preds = %bb33 + br i1 undef, label %bb63, label %bb50 + +bb50: ; preds = %bb49 + br i1 undef, label %bb51, label %bb63 + +bb51: ; preds = %bb50 + br i1 undef, label %bb52, label %bb53 + +bb52: ; preds = %bb51 + br label %bb53 + +bb53: ; preds = %bb52, %bb51 + br label %bb63 + +bb54: ; preds =
%"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618", %bb33 + %velocity.i.0 = phi <3 x float> [ %tmp83, %"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618" ], [ %tmp48, %bb33 ] + br i1 undef, label %bb55, label %.critedge + +bb55: ; preds = %bb54 + br i1 undef, label %bb56, label %.critedge + +bb56: ; preds = %bb55 + br i1 undef, label %"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i", label %.critedge + +"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i": ; preds = %bb56 + br i1 undef, label %bb57, label %.critedge + +bb57: ; preds = %"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i" + br i1 undef, label %"OctWrap(vf2;.exit.i.i", label %"NormalOctDecode(vf2;b1;.exit.i" + +"OctWrap(vf2;.exit.i.i": ; preds = %bb57 + br label %"NormalOctDecode(vf2;b1;.exit.i" + +"NormalOctDecode(vf2;b1;.exit.i": ; preds = %"OctWrap(vf2;.exit.i.i", %bb57 + %end.i120.i = fsub <3 x float> %tmp48, undef + %scale50.i = fmul <3 x float> undef, %end.i120.i + %tmp58 = fsub reassoc nnan arcp contract <3 x float> %scale50.i, undef + br i1 undef, label %bb59, label %.critedge + +bb59: ; preds = %"NormalOctDecode(vf2;b1;.exit.i" + br i1 undef, label %bb60, label %.critedge + +bb60: ; preds = %bb59 + br label %.critedge + +.critedge: ; preds = %bb60, %bb59, %"NormalOctDecode(vf2;b1;.exit.i", %"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i", %bb56, %bb55, %bb54 + %velocity.i.1 = phi <3 x float> [ %tmp58, %bb60 ], [ %tmp58, %bb59 ], [ %velocity.i.0, %"GetLinearDepth(f1;vf4;f1;b1;.exit.i.i" ], [ %velocity.i.0, %bb55 ], [ %velocity.i.0, %bb56 ], [ %velocity.i.0, %bb54 ], [ %tmp58, %"NormalOctDecode(vf2;b1;.exit.i" ] + %tmp61 = shufflevector <3 x float> %velocity.i.1, <3 x float> undef, <2 x i32> <i32 0, i32 1> + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %tmp61, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1 + br i1 undef, label %bb62, label %"SimulateParticle(i1;f1;.exit" + +bb62: ; preds = %.critedge + br label %"SimulateParticle(i1;f1;.exit" + +"SimulateParticle(i1;f1;.exit": ; preds = %bb62, %.critedge, %bb32, %bb7, %bb6, %bb5, %bb2
+ ret void + +bb63: ; preds = %bb53, %bb50, %bb49 + %param6.i.1 = phi <3 x float> [ undef, %bb53 ], [ %tmp48, %bb50 ], [ %tmp48, %bb49 ] + br i1 undef, label %"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618", label %bb64 + +bb64: ; preds = %bb63 + br i1 undef, label %bb65, label %"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618" + +bb65: ; preds = %bb64 + br i1 undef, label %bb66, label %bb67 + +bb66: ; preds = %bb65 + br label %bb67 + +bb67: ; preds = %bb66, %bb65 + br label %"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618" + +"ApplyVectorFieldForces(vf4;vf3;vf3;.exit.i618": ; preds = %bb67, %bb64, %bb63 + %param6.i.1.1 = phi <3 x float> [ zeroinitializer, %bb67 ], [ %param6.i.1, %bb64 ], [ %param6.i.1, %bb63 ] + %tmp68 = fmul reassoc nnan arcp contract float %tmp46, 2.000000e+00 + %tmp69 = fadd reassoc nnan arcp contract float %tmp68, -1.000000e+00 + %tmp70 = fmul reassoc nnan arcp contract float %tmp69, undef + %tmp71 = fadd reassoc nnan arcp contract float %tmp70, undef + %tmp72 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %arg, i32 144, i1 false) #1 + %tmp73 = bitcast <4 x i32> %tmp72 to <4 x float> + %tmp74 = shufflevector <4 x float> %tmp73, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> + %tmp75 = fdiv float 1.000000e+00, %tmp71 + %tmp76 = fmul float undef, %tmp75 + %tmp77 = fmul float undef, %tmp75 + %tmp78 = fmul float undef, %tmp75 + %tmp79 = insertelement <3 x float> undef, float %tmp76, i32 0 + %tmp80 = insertelement <3 x float> %tmp79, float %tmp77, i32 1 + %tmp81 = insertelement <3 x float> %tmp80, float %tmp78, i32 2 + %tmp82 = fadd reassoc nnan arcp contract <3 x float> %tmp74, %tmp81 + %scale.i = fmul <3 x float> undef, %tmp82 + %tmp83 = fadd reassoc nnan arcp contract <3 x float> %param6.i.1.1, %scale.i + br label %bb54 +} + +; Function Attrs: nounwind readnone speculatable +declare float @llvm.amdgcn.fract.f32(float) #0 + +; Function Attrs: nounwind readonly +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #2 + +;
Function Attrs: nounwind readnone +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i1) #3 + +; Function Attrs: nounwind writeonly +declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #4 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind } +attributes #2 = { nounwind readonly } +attributes #3 = { nounwind readnone } +attributes #4 = { nounwind writeonly } +