Index: llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -76,7 +76,7 @@ static cl::opt RemoveRedundantEndcf("amdgpu-remove-redundant-endcf", - cl::init(false), cl::ReallyHidden); + cl::init(true), cl::ReallyHidden); namespace { @@ -87,6 +87,7 @@ LiveIntervals *LIS = nullptr; MachineRegisterInfo *MRI = nullptr; DenseSet LoweredEndCf; + DenseSet LoweredIf; const TargetRegisterClass *BoolRC = nullptr; unsigned AndOpc; @@ -212,6 +213,7 @@ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) .addReg(Exec) .addReg(Exec, RegState::ImplicitDefine); + LoweredIf.insert(CopyReg); Register Tmp = MRI->createVirtualRegister(BoolRC); @@ -453,11 +455,19 @@ skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI.getIterator())); if (Next != MBB.end() && (Next->getOpcode() == AMDGPU::SI_END_CF || LoweredEndCf.count(&*Next))) { - LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump()); - if (LIS) - LIS->RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); - return; + // Only skip inner END_CF if outer ENDCF belongs to SI_IF. + // If that belongs to SI_ELSE then saved mask has an inverted value. + Register SavedExec = Next->getOperand(0).getReg(); + const MachineInstr *Def = MRI.getUniqueVRegDef(SavedExec); + // A lowered SI_IF turns definition into COPY of exec. + if (Def && (Def->getOpcode() == AMDGPU::SI_IF || + LoweredIf.count(SavedExec))) { + LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump()); + if (LIS) + LIS->RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + return; + } } } @@ -617,6 +627,7 @@ } LoweredEndCf.clear(); + LoweredIf.clear(); return true; } Index: llvm/test/CodeGen/AMDGPU/collapse-endcf.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -142,16 +142,15 @@ ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]] ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; GCN: store_dword -; GCN-NEXT: s_and_b64 exec, exec, +; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_ELSE:s\[[0-9:]+\]]], ; GCN-NEXT: s_cbranch_execz [[FLOW1:BB[0-9_]+]] ; GCN-NEXT: ; %bb.{{[0-9]+}}: ; GCN: store_dword ; GCN-NEXT: [[FLOW1]]: -; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]] -; GCN-NOT: s_or_b64 exec -; GCN-NOT: {{^.*:}} -; GCN: ds_write_b32 -; GCN: s_endpgm +; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_ELSE]] +; GCN: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]] +; GCN: ds_write_b32 +; GCN: s_endpgm define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/collapse-endcf.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/collapse-endcf.mir +++ llvm/test/CodeGen/AMDGPU/collapse-endcf.mir @@ -834,3 +834,124 @@ SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.1 + +... + +# Both s_or_b64 shall be preserved since the outer SI_END_CF belongs to SI_ELSE. + +--- +name: simple_outer_if_else +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: simple_outer_if_else + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 1, [[COPY1]], implicit $exec + ; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_LT_U32_e64_]], implicit-def dead $scc + ; GCN: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc + ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; GCN: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN: bb.1: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: S_BRANCH %bb.2 + ; GCN: bb.2: + ; GCN: successors: %bb.3(0x40000000), %bb.6(0x40000000) + ; GCN: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[S_XOR_B64_]] + ; GCN: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[COPY3]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc + ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec + ; GCN: bb.3: + ; GCN: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GCN: undef %5.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GCN: undef %6.sub0:vreg_64 = V_LSHLREV_B32_e32 2, [[COPY1]], implicit $exec + ; GCN: %6.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %5.sub1 + ; GCN: undef %8.sub0:vreg_64, %9:sreg_64_xexec = V_ADD_I32_e64 %5.sub0, %6.sub0, 0, implicit $exec + ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY4]], %9, 0, implicit $exec + ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 + ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec + ; GCN: [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY5]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc + ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] + ; GCN: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN: bb.4: + ; GCN: successors: %bb.5(0x80000000) + ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 + ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: bb.5: + ; GCN: successors: %bb.6(0x80000000) + ; GCN: $exec = S_OR_B64 $exec, [[COPY5]], implicit-def $scc + ; GCN: bb.6: + ; GCN: $exec = S_OR_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $m0 = S_MOV_B32 -1 + ; GCN: DS_WRITE_B32 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) + ; GCN: S_ENDPGM 0 + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %2:sreg_64 = V_CMP_LT_U32_e64 1, %0, implicit $exec + %3:sreg_64 = SI_IF %2:sreg_64, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3, %bb.6 + %4:sreg_64 = SI_ELSE %3:sreg_64, %bb.6, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + bb.3: + successors: %bb.3, %bb.4 + + undef %5.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %1, 9, 0, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + undef %6.sub0:vreg_64 = V_LSHLREV_B32_e32 2, %0, implicit $exec + %6.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec + %7:vgpr_32 = COPY %5.sub1 + undef %8.sub0:vreg_64, %9:sreg_64_xexec = V_ADD_I32_e64 %5.sub0, %6.sub0, 0, implicit $exec + %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec + %5.sub3:sgpr_128 = S_MOV_B32 61440 + %5.sub2:sgpr_128 = S_MOV_B32 0 + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec + %12:sreg_64 = SI_IF %11:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + bb.4: + successors: %bb.5 + + %5.sub0:sgpr_128 = COPY %5.sub2 + %5.sub1:sgpr_128 = COPY %5.sub2 + %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + + bb.5: + successors: %bb.6 + + SI_END_CF %12:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + bb.6: + SI_END_CF %4:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + $m0 = S_MOV_B32 -1 + DS_WRITE_B32 %16, %15, 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) + S_ENDPGM 0 + +... Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -58,7 +58,7 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 ; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5] -; GFX9-NEXT: s_cbranch_execz BB1_4 +; GFX9-NEXT: s_cbranch_execz BB1_3 ; GFX9-NEXT: ; %bb.1: ; %bb19 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6 @@ -100,9 +100,7 @@ ; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[12:13] ; GFX9-NEXT: s_cbranch_execnz BB1_2 -; GFX9-NEXT: ; %bb.3: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[12:13] -; GFX9-NEXT: BB1_4: ; %Flow3 +; GFX9-NEXT: BB1_3: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31]