Index: lib/CodeGen/MachineCSE.cpp
===================================================================
--- lib/CodeGen/MachineCSE.cpp
+++ lib/CodeGen/MachineCSE.cpp
@@ -473,11 +473,34 @@
   }
 
   // Heuristics #3: If the common subexpression is used by PHIs, do not reuse
-  // it unless the defined value is already used in the BB of the new use.
+  // it unless:
+  // a) the defined value is already used in the BB of the new use or
+  // b) all uses of the defined value are in the BB whose only successor
+  // contains the new use
+
+  // First check for BB containing all uses
+  MachineBasicBlock *BBUses = nullptr;
+  for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
+    if (BBUses && BBUses != UseMI.getParent()) {
+      BBUses = nullptr;
+      break;
+    }
+    if (!BBUses) {
+      if (UseMI.getParent()->succ_size() != 1)
+        break;
+      BBUses = UseMI.getParent();
+    }
+  }
+
   bool HasPHI = false;
-  for (MachineInstr &UseMI : MRI->use_nodbg_instructions(CSReg)) {
-    HasPHI |= UseMI.isPHI();
-    if (UseMI.getParent() == MI->getParent())
+  for (MachineInstr &UseCSMI : MRI->use_nodbg_instructions(CSReg)) {
+    HasPHI |= UseCSMI.isPHI();
+    // a) the defined value is already used in the BB of the new use
+    if (UseCSMI.getParent() == MI->getParent())
+      return true;
+
+    // b) the BB's only successor contains the new use
+    if (BBUses && UseCSMI.getParent() == *BBUses->succ_begin())
       return true;
   }
 
Index: test/CodeGen/AMDGPU/cse-phi-incoming-val.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/cse-phi-incoming-val.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck %s
+
+; Check that the redundant immediate MOV instruction
+; (by-product of handling phi nodes) is not found
+; in the generated code due to CSE heuristic.
+
+; CHECK-LABEL: {{^}}mov_opt:
+; CHECK: v_mov_b32_e32 {{v[0-9]+}}, 1.0
+; CHECK: %bb.1:
+; CHECK-NOT: v_mov_b32_e32 {{v[0-9]+}}, 1.0
+; CHECK: BB0_2:
+
+define amdgpu_ps void @mov_opt(i32 %arg, i32 inreg %arg1, i32 inreg %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp eq i32 %arg1, 0
+  br i1 %tmp, label %bb3, label %bb10
+
+bb3: ; preds = %bb
+  %tmp4 = icmp eq i32 %arg2, 0
+  br i1 %tmp4, label %bb5, label %bb10
+
+bb5: ; preds = %bb3
+  %tmp6 = getelementptr <{ [4294967295 x i32] }>, <{ [4294967295 x i32] }> addrspace(6)* null, i32 0, i32 0, i32 %arg
+  %tmp7 = load i32, i32 addrspace(6)* %tmp6
+  %tmp8 = icmp eq i32 %tmp7, 1
+  br i1 %tmp8, label %bb10, label %bb9
+
+bb9: ; preds = %bb5
+  br label %bb10
+
+bb10: ; preds = %bb9, %bb5, %bb3, %bb
+  %tmp11 = phi float [ 1.000000e+00, %bb3 ], [ 0.000000e+00, %bb9 ], [ 1.000000e+00, %bb ], [ undef, %bb5 ]
+  call void @llvm.amdgcn.exp.f32(i32 immarg 40, i32 immarg 15, float %tmp11, float undef, float undef, float undef, i1 immarg false, i1 immarg false) #0
+  ret void
+}
+
+; Function Attrs: inaccessiblememonly nounwind
+declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { inaccessiblememonly nounwind }
Index: test/CodeGen/AMDGPU/cse-phi-incoming-val.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/cse-phi-incoming-val.mir
@@ -0,0 +1,73 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass machine-cse -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# Check that the redundant immediate MOV instruction
+# (by-product of handling phi nodes) is not found
+# in the generated code due to CSE heuristic.
+
+# GCN-LABEL: name: cse_phi_incoming_val
+# GCN: bb.0:
+# GCN: V_MOV_B32_e32 1065353216
+# GCN: bb.1:
+# GCN-NOT: V_MOV_B32_e32 1065353216
+# GCN: bb.2:
+
+---
+name: cse_phi_incoming_val
+tracksRegLiveness: true
+body: |
+  bb.0:
+    successors: %bb.1, %bb.6
+    liveins: $vgpr0, $sgpr0, $sgpr1
+    %5:sgpr_32 = COPY $sgpr1
+    %4:sgpr_32 = COPY $sgpr0
+    %3:vgpr_32 = COPY $vgpr0
+    %7:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
+    %8:sreg_32_xm0 = S_MOV_B32 0
+    S_CMP_LG_U32 %4:sgpr_32, killed %8:sreg_32_xm0, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.6, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.5
+
+    %9:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
+    %10:sreg_32_xm0 = S_MOV_B32 0
+    S_CMP_LG_U32 %5:sgpr_32, killed %10:sreg_32_xm0, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.5, implicit $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3, %bb.4
+
+    %11:sreg_32_xm0 = S_MOV_B32 2
+    %12:vgpr_32 = V_LSHLREV_B32_e64 %11:sreg_32_xm0, %3:vgpr_32, implicit $exec
+
+    %17:sreg_64 = V_CMP_NE_U32_e64 killed %11:sreg_32_xm0, %12:vgpr_32, implicit $exec
+
+    %0:sreg_64 = SI_IF killed %17:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+
+  bb.4:
+    successors: %bb.5
+
+    SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %19:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+  bb.5:
+    successors: %bb.6
+
+    %1:vgpr_32 = PHI %9:vgpr_32, %bb.1, %19:vgpr_32, %bb.4
+
+  bb.6:
+
+    %2:vgpr_32 = PHI %7:vgpr_32, %bb.0, %1:vgpr_32, %bb.5
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    EXP 40, %2:vgpr_32, %20:vgpr_32, %21:vgpr_32, %22:vgpr_32, 0, 0, 15, implicit $exec
+    S_ENDPGM 0
+---
+
Index: test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- test/CodeGen/AMDGPU/multilevel-break.ll
+++ test/CodeGen/AMDGPU/multilevel-break.ll
@@ -100,9 +100,9 @@
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
 ; GCN: s_mov_b64 [[OLD_LEFT]], [[LEFT]]
+; GCN: s_mov_b64
 
 ; GCN: ; %LeafBlock1
-; GCN: s_mov_b64
 ; GCN: s_mov_b64 [[BREAK]], -1{{$}}
 
 ; GCN: ; %case1