Index: llvm/trunk/include/llvm/CodeGen/MachineScheduler.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineScheduler.h
+++ llvm/trunk/include/llvm/CodeGen/MachineScheduler.h
@@ -1019,8 +1019,7 @@
                                const TargetRegisterInfo *TRI);
 
 std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI);
+createMacroFusionDAGMutation(const TargetInstrInfo *TII);
 
 std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
Index: llvm/trunk/lib/CodeGen/MachineScheduler.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/MachineScheduler.cpp
+++ llvm/trunk/lib/CodeGen/MachineScheduler.cpp
@@ -1501,10 +1501,9 @@
 /// that may be fused by the processor into a single operation.
 class MacroFusion : public ScheduleDAGMutation {
   const TargetInstrInfo &TII;
-  const TargetRegisterInfo &TRI;
 public:
-  MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI)
-    : TII(TII), TRI(TRI) {}
+  MacroFusion(const TargetInstrInfo &TII)
+    : TII(TII) {}
 
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 };
@@ -1513,27 +1512,12 @@
 namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI) {
-  return make_unique<MacroFusion>(*TII, *TRI);
+createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
+  return make_unique<MacroFusion>(*TII);
 }
 
 } // namespace llvm
 
-/// Returns true if \p MI reads a register written by \p Other.
-static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI,
-                       const MachineInstr &Other) {
-  for (const MachineOperand &MO : MI.uses()) {
-    if (!MO.isReg() || !MO.readsReg())
-      continue;
-
-    unsigned Reg = MO.getReg();
-    if (Other.modifiesRegister(Reg, &TRI))
-      return true;
-  }
-  return false;
-}
-
 /// \brief Callback from DAG postProcessing to create cluster edges to encourage
 /// fused operations.
 void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
@@ -1545,16 +1529,12 @@
   if (!Branch)
     return;
 
-  for (SUnit &SU : DAG->SUnits) {
-    // SUnits with successors can't be schedule in front of the ExitSU.
-    if (!SU.Succs.empty())
-      continue;
-    // We only care if the node writes to a register that the branch reads.
-    MachineInstr *Pred = SU.getInstr();
-    if (!HasDataDep(TRI, *Branch, *Pred))
+  for (SDep &PredDep : ExitSU.Preds) {
+    if (PredDep.isWeak())
       continue;
-
-    if (!TII.shouldScheduleAdjacent(*Pred, *Branch))
+    SUnit &SU = *PredDep.getSUnit();
+    MachineInstr &Pred = *SU.getInstr();
+    if (!TII.shouldScheduleAdjacent(Pred, *Branch))
       continue;
 
     // Create a single weak edge from SU to ExitSU. The only effect is to cause
@@ -1567,6 +1547,16 @@
     (void)Success;
     assert(Success && "No DAG nodes should be reachable from ExitSU");
 
+    // Adjust latency of data deps between the nodes.
+    for (SDep &PredDep : ExitSU.Preds) {
+      if (PredDep.getSUnit() == &SU)
+        PredDep.setLatency(0);
+    }
+    for (SDep &SuccDep : SU.Succs) {
+      if (SuccDep.getSUnit() == &ExitSU)
+        SuccDep.setLatency(0);
+    }
+
     DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
     break;
   }
@@ -3128,7 +3118,7 @@
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   }
   if (EnableMacroFusion)
-    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII, DAG->TRI));
+    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
   return DAG;
 }
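
With the TargetRegisterInfo parameter gone, a backend needs only its TargetInstrInfo to request macro fusion. Below is a minimal sketch of wiring the mutation into a custom scheduler, assuming a hypothetical MyTarget; the generic pipeline above does the equivalent under the EnableMacroFusion flag:

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

// Illustrative target hook; "MyTarget" is a placeholder, not in-tree code.
static ScheduleDAGInstrs *createMyTargetMIScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C));
  // Macro fusion no longer needs TRI: candidate pairs are discovered via
  // the dependence edges already recorded on ExitSU.
  DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
  return DAG;
}
```

Constructing ScheduleDAGMILive with a GenericScheduler strategy mirrors what the default machine scheduler does before target mutations are applied.
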
Index: llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -247,11 +247,8 @@
 void ScheduleDAGInstrs::addSchedBarrierDeps() {
   MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
   ExitSU.setInstr(ExitMI);
-  bool AllDepKnown = ExitMI &&
-    (ExitMI->isCall() || ExitMI->isBarrier());
-  if (ExitMI && AllDepKnown) {
-    // If it's a call or a barrier, add dependencies on the defs and uses of
-    // instruction.
+  // Add dependencies on the defs and uses of the instruction.
+  if (ExitMI) {
     for (const MachineOperand &MO : ExitMI->operands()) {
       if (!MO.isReg() || MO.isDef()) continue;
       unsigned Reg = MO.getReg();
@@ -261,10 +258,10 @@
         addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
       }
     }
-  } else {
+  }
+  if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
     // For others, e.g. fallthrough, conditional branch, assume the exit
     // uses all the registers that are livein to the successor blocks.
-    assert(Uses.empty() && "Uses in set before adding deps?");
     for (const MachineBasicBlock *Succ : BB->successors()) {
       for (const auto &LI : Succ->liveins()) {
         if (!Uses.contains(LI.PhysReg))
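
Since addSchedBarrierDeps() now records the use operands of whatever instruction ends the region (previously the exit instruction's own operands were modeled only for calls and barriers), and still falls back to successor live-ins for fall-throughs and conditional branches, ExitSU.Preds is a reliable place to find the nodes feeding the exit. A small sketch of that traversal, using only APIs that appear in this patch; the function name is invented for illustration:

```cpp
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

// Visit every strong (non-weak) predecessor of the exit node, i.e. each
// SUnit whose result the exit instruction (e.g. a conditional branch) reads.
static void visitExitPreds(ScheduleDAGInstrs &DAG) {
  for (const SDep &PredDep : DAG.ExitSU.Preds) {
    if (PredDep.isWeak()) // Skip cluster and other artificial edges.
      continue;
    SUnit *SU = PredDep.getSUnit();
    dbgs() << "SU(" << SU->NodeNum << ") feeds the exit instruction\n";
  }
}
```

This is the same pattern the rewritten MacroFusion::apply() relies on, which is why it no longer needs the HasDataDep() register scan.
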
Index: llvm/trunk/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -78,8 +78,8 @@
 ; Next BB.
 ; CHECK: [[LOOP:LBB[0-9_]+]]: ; %for.body
 ; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: cbnz [[IV]], [[LOOP]]
 ;
 ; Next BB.
@@ -144,8 +144,8 @@
 ; Next BB.
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: ; %for.end
@@ -188,8 +188,8 @@
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: bl _somethingElse
@@ -259,8 +259,8 @@
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: lsl w0, [[SUM]], #3
@@ -333,32 +333,32 @@
 ;
 ; Sum is merged with the returned register.
 ; CHECK: add [[VA_BASE:x[0-9]+]], sp, #16
-; CHECK-NEXT: str [[VA_BASE]], [sp, #8]
 ; CHECK-NEXT: cmp w1, #1
+; CHECK-NEXT: str [[VA_BASE]], [sp, #8]
+; CHECK-NEXT: mov [[SUM:w0]], wzr
 ; CHECK-NEXT: b.lt [[IFEND_LABEL:LBB[0-9_]+]]
-; CHECK: mov [[SUM:w0]], wzr
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; CHECK: ldr [[VA_ADDR:x[0-9]+]], [sp, #8]
 ; CHECK-NEXT: add [[NEXT_VA_ADDR:x[0-9]+]], [[VA_ADDR]], #8
 ; CHECK-NEXT: str [[NEXT_VA_ADDR]], [sp, #8]
 ; CHECK-NEXT: ldr [[VA_VAL:w[0-9]+]], {{\[}}[[VA_ADDR]]]
-; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
 ; CHECK-NEXT: sub w1, w1, #1
+; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
 ; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]]
+; DISABLE-NEXT: b [[IFEND_LABEL]]
 ;
-; DISABLE-NEXT: b
 ; DISABLE: [[ELSE_LABEL]]: ; %if.else
 ; DISABLE: lsl w0, w1, #1
 ;
-; ENABLE: [[ELSE_LABEL]]: ; %if.else
-; ENABLE: lsl w0, w1, #1
-; ENABLE-NEXT: ret
-;
 ; CHECK: [[IFEND_LABEL]]:
 ; Epilogue code.
 ; CHECK: add sp, sp, #16
 ; CHECK-NEXT: ret
+;
+; ENABLE: [[ELSE_LABEL]]: ; %if.else
+; ENABLE-NEXT: lsl w0, w1, #1
+; ENABLE-NEXT: ret
 define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 {
 entry:
   %ap = alloca i8*, align 8
@@ -413,9 +413,9 @@
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; Inline asm statement.
-; CHECK: add x19, x19, #1
 ; CHECK: sub [[IV]], [[IV]], #1
-; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
+; CHECK: add x19, x19, #1
+; CHECK: cbnz [[IV]], [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: mov w0, wzr
 ; Epilogue code.
Index: llvm/trunk/test/CodeGen/AArch64/misched-fusion.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/misched-fusion.ll
+++ llvm/trunk/test/CodeGen/AArch64/misched-fusion.ll
@@ -1,4 +1,4 @@
-; RUN: llc -o - %s -mattr=+arith-cbz-fusion,+use-postra-scheduler | FileCheck %s
+; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s
 ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
 
 target triple = "arm64-apple-ios"
Index: llvm/trunk/test/CodeGen/AArch64/neg-imm.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/neg-imm.ll
+++ llvm/trunk/test/CodeGen/AArch64/neg-imm.ll
@@ -30,9 +30,9 @@
 
 for.inc:
 ; CHECK_LABEL: %for.inc
-; CHECK: add
-; CHECK-NEXT: cmp
-; CHECK: b.le
+; CHECK: cmp
+; CHECK-NEXT: add
+; CHECK-NEXT: b.le
 ; CHECK_LABEL: %for.cond.cleanup
   %inc = add nsw i32 %x.015, 1
   %cmp1 = icmp sgt i32 %x.015, %px
Index: llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -171,6 +171,7 @@
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: v_add_i32_e32 [[INC:v[0-9]+]], vcc, 1, [[LOOPIDX]]
 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 10, [[INC]]
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: v_nop_e64
@@ -178,7 +179,6 @@
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
@@ -426,6 +426,8 @@
 ; GCN-NEXT: s_setpc_b64 vcc
 
 ; GCN-NEXT: [[LOOP_BODY]]: ; %loop_body
+; GCN: s_mov_b64 vcc, -1{{$}}
+; GCN: ;;#ASMSTART
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
@@ -433,7 +435,6 @@
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: ;;#ASMEND
-; GCN-NEXT: s_mov_b64 vcc, -1{{$}}
 ; GCN-NEXT: s_cbranch_vccz [[RET]]
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body
@@ -493,6 +494,7 @@
 ; GCN: [[LONG_BR_DEST0]]
 ; GCN: s_cmp_eq_u32
+; GCN-NEXT: ; implicit-def
 ; GCN-NEXT: s_cbranch_scc0
 
 ; GCN: s_setpc_b64
Index: llvm/trunk/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -7,7 +7,7 @@
 ; GCN-LABEL: {{^}}vcc_shrink_vcc_def:
 ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
 define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
@@ -34,7 +34,7 @@
 ; GCN-LABEL: {{^}}preserve_condition_undef_flag:
 ; GCN-NOT: vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
 define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
Index: llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -78,6 +78,8 @@
 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE: v_mov_b32_e32 v2, 2
+; IDXMODE: v_mov_b32_e32 v3, 3
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
@@ -95,6 +97,10 @@
 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE: v_mov_b32_e32 v0,
+; IDXMODE: v_mov_b32_e32 v1,
+; IDXMODE: v_mov_b32_e32 v2,
+; IDXMODE: v_mov_b32_e32 v3,
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
@@ -572,12 +578,12 @@
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000
 ; GCN-DAG: s_load_dword [[ARG:s[0-9]+]]
+; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
 ; MOVREL-DAG: s_add_i32 m0, [[ARG]], -16
 ; MOVREL: v_movreld_b32_e32 v[[VEC0_ELT0]], 4.0
 
 ; GCN-NOT: m0
-; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
 ; IDXMODE: s_set_gpr_idx_on [[ARG_ADD]], dst
 ; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT0]], 4.0
 ; IDXMODE: s_set_gpr_idx_off
Index: llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -138,6 +138,7 @@
 ; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
 ; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; CHECK-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
 
 ; CHECK-NEXT: ; BB#1: ; %bb
Index: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
@@ -213,8 +213,8 @@
 ;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: image_sample
-;CHECK: store
 ;CHECK: v_cmp
+;CHECK: store
 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
Index: llvm/trunk/test/CodeGen/ARM/arm-shrink-wrapping.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/arm-shrink-wrapping.ll
+++ llvm/trunk/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -644,6 +644,7 @@
 ; CHECK: push
 ;
 ; DISABLE: tst{{(\.w)?}} r2, #1
+; DISABLE-NEXT: vst1.64
 ; DISABLE-NEXT: beq [[BB13:LBB[0-9_]+]]
 ;
 ; CHECK: bl{{x?}} _pow
Index: llvm/trunk/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
+++ llvm/trunk/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
@@ -210,6 +210,8 @@
 ; CHECK: mflr {{[0-9]+}}
 ;
 ; DISABLE: cmplwi 0, 3, 0
+; DISABLE-NEXT: std
+; DISABLE-NEXT: std
 ; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; Loop preheader
@@ -290,6 +292,8 @@
 ; CHECK: mflr {{[0-9]+}}
 ;
 ; DISABLE: cmplwi 0, 3, 0
+; DISABLE-NEXT: std
+; DISABLE-NEXT: std
 ; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; CHECK: bl somethingElse
@@ -377,8 +381,8 @@
 ; ENABLE-DAG: li [[IV:[0-9]+]], 10
 ; ENABLE-DAG: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
 ;
-; DISABLE: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
 ; DISABLE: cmplwi 0, 3, 0
+; DISABLE-NEXT: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
 ; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
 ; DISABLE: li [[IV:[0-9]+]], 10
 ;
Index: llvm/trunk/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
===================================================================
--- llvm/trunk/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
+++ llvm/trunk/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
@@ -56,9 +56,9 @@
 define i32 @test_inlineasm(i32 %a) nounwind {
 entry:
 ;CHECK-LABEL: test_inlineasm:
+;CHECK: cmp
 ;CHECK: sethi
 ;CHECK: !NO_APP
-;CHECK-NEXT: cmp
 ;CHECK-NEXT: ble
 ;CHECK-NEXT: mov
   tail call void asm sideeffect "sethi 0, %g0", ""() nounwind
Index: llvm/trunk/test/CodeGen/SystemZ/int-cmp-48.ll
===================================================================
--- llvm/trunk/test/CodeGen/SystemZ/int-cmp-48.ll
+++ llvm/trunk/test/CodeGen/SystemZ/int-cmp-48.ll
@@ -29,8 +29,8 @@
 define void @f2(i8 *%src) {
 ; CHECK-LABEL: f2:
 ; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
-; CHECK: mvi 0(%r2), 0
 ; CHECK: tmll [[REG]], 1
+; CHECK: mvi 0(%r2), 0
 ; CHECK: ber %r14
 ; CHECK: br %r14
 entry:
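
Most of the test churn above follows from the scheduler now seeing the exit instruction's data dependencies, which lets it place a flag-setting instruction next to the branch that consumes it (the sub/cbnz pairs in the AArch64 tests, for example). Which pairs actually fuse is decided by the shouldScheduleAdjacent() hook that MacroFusion::apply() queries. A hedged sketch of such an override for an imaginary target, not the in-tree AArch64 logic:

```cpp
// "MyTargetInstrInfo", "isFlagSettingALUOp" and "isCondBranchOnFlags" are
// placeholders invented for this sketch, not in-tree LLVM API.
bool MyTargetInstrInfo::shouldScheduleAdjacent(
    const MachineInstr &First, const MachineInstr &Second) const {
  // Keep a flag-setting ALU instruction adjacent to the conditional branch
  // that tests those flags, so the hardware can issue the pair as a single
  // macro-op.
  return isFlagSettingALUOp(First.getOpcode()) &&
         isCondBranchOnFlags(Second.getOpcode());
}
```

When the hook returns true, the mutation adds the weak cluster edge and zeroes the latency between the pair, as implemented in MachineScheduler.cpp above.
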