diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -368,7 +368,13 @@
 // SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
-let isTerminator = 1 in {
+// As we have enhanced control flow intrinsics to work under an unstructured
+// CFG, duplicating such intrinsics could actually be treated as legal.
+// However, we observe better code generation when they are non-duplicable, so
+// we mark them non-duplicable in the hope of better code generation as well
+// as a simplified CFG during the Machine IR optimization stage.
+
+let isTerminator = 1, isNotDuplicable = 1 in {
 
 let OtherPredicates = [EnableLateCFGStructurize] in {
 def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
@@ -418,6 +424,7 @@
   let isAsCheapAsAMove = 1;
   let isReMaterializable = 1;
   let hasSideEffects = 1;
+  let isNotDuplicable = 1; // Not a hard requirement; see the long comment above for details.
   let mayLoad = 1; // FIXME: Should not need memory flags
   let mayStore = 1;
 }
@@ -425,6 +432,7 @@
 def SI_IF_BREAK : CFPseudoInstSI <
   (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
   let Size = 4;
+  let isNotDuplicable = 1; // Not a hard requirement; see the long comment above for details.
   let isAsCheapAsAMove = 1;
   let isReMaterializable = 1;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
@@ -18,12 +18,12 @@
 ; GFX10-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    s_and_b32 s0, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s2, s0, s2
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
 ; GFX10-NEXT:    s_cbranch_execz .LBB0_5
 ; GFX10-NEXT:  .LBB0_2: ; %bb4
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_and_b32 s0, exec_lo, vcc_lo
-; GFX10-NEXT:    s_or_b32 s2, s0, s2
 ; GFX10-NEXT:    s_and_saveexec_b32 s3, s1
 ; GFX10-NEXT:    s_cbranch_execz .LBB0_1
 ; GFX10-NEXT:    ; %bb.3: ; in Loop: Header=BB0_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=early-tailduplication -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: stop_duplicate_cfg_intrinsic
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: stop_duplicate_cfg_intrinsic
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $vgpr0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.3
+  ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT: S_BRANCH %bb.3
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[PHI]], [[COPY2]], 0, implicit $exec
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_CO_U32_e64_]], %bb.2, [[PHI]], %bb.1
+  ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
+  ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
+  ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET [[PHI1]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec
+  ; CHECK-NEXT: S_BRANCH %bb.1
+  bb.1:
+    liveins: $vgpr0
+
+    %0:vgpr_32 = COPY $vgpr0
+    %12:sreg_64 = IMPLICIT_DEF
+    %4:sreg_32 = S_MOV_B32 0
+    %14:vgpr_32 = COPY %4:sreg_32
+    %5:sreg_64_xexec = V_CMP_EQ_U32_e64 %0:vgpr_32, %14:vgpr_32, implicit $exec
+
+  bb.2:
+    %6:vgpr_32 = PHI %4:sreg_32, %bb.1, %11:vgpr_32, %bb.4
+    %8:sreg_64_xexec = SI_IF %5:sreg_64_xexec, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    SI_END_CF %8:sreg_64_xexec, implicit-def $exec, implicit-def $scc, implicit $exec
+    %13:sreg_32 = S_MOV_B32 1
+    %15:vgpr_32 = COPY %13:sreg_32
+    %10:vgpr_32, dead %20:sreg_64_xexec = V_ADD_CO_U32_e64 %6:vgpr_32, %15:vgpr_32, 0, implicit $exec
+
+  bb.4:
+    %11:vgpr_32 = PHI %10:vgpr_32, %bb.3, %6:vgpr_32, %bb.2
+    %16:sreg_32 = S_MOV_B32 4294967295
+    %17:sreg_32 = S_MOV_B32 61440
+    %18:sreg_64 = REG_SEQUENCE %16:sreg_32, %subreg.sub0, %17:sreg_32, %subreg.sub1
+    %19:sgpr_128 = REG_SEQUENCE %12:sreg_64, %subreg.sub0_sub1, %18:sreg_64, %subreg.sub2_sub3
+    BUFFER_STORE_DWORD_OFFSET %11:vgpr_32, %19:sgpr_128, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.2
+
+...
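
Note on the mechanism: `isNotDuplicable = 1` becomes the MCID::NotDuplicable flag in the
TableGen-generated instruction descriptions, and block-duplication passes refuse to clone a
block once any instruction in it carries that flag. Below is a minimal sketch of how such a
legality check can consult the flag through the generic MachineInstr API; `blockIsDuplicable`
is an illustrative helper, not the actual TailDuplicator code.

  // Sketch only: reject a block for duplication if any instruction in it is
  // marked not-duplicable (e.g. SI_IF_BREAK / SI_END_CF after this patch).
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstr.h"

  static bool blockIsDuplicable(const llvm::MachineBasicBlock &MBB) {
    for (const llvm::MachineInstr &MI : MBB)
      // MachineInstr::isNotDuplicable() reads MCID::NotDuplicable from the
      // instruction description generated from the .td definitions above.
      if (MI.isNotDuplicable())
        return false;
    return true;
  }

With the pseudos marked this way, early-tailduplication leaves the block containing SI_END_CF
intact instead of cloning it into its predecessors, which is what the CHECK lines in the new
MIR test assert.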