Index: lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
===================================================================
--- lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -111,9 +111,64 @@
   const SIInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+  DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
   bool Changed = false;

   for (MachineBasicBlock &MBB : MF) {
+
+    // Try to remove unneeded instructions before s_endpgm.
+    if (MBB.succ_size() == 0) {
+      if (MBB.empty() || MBB.back().getOpcode() != AMDGPU::S_ENDPGM)
+        continue;
+
+      SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
+
+      while (!Blocks.empty()) {
+        auto CurBB = Blocks.pop_back_val();
+        auto I = CurBB->rbegin(), E = CurBB->rend();
+        if (I != E) {
+          if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
+            ++I;
+          else if (I->isBranch())
+            continue;
+        }
+
+        while (I != E) {
+          if (I->isDebugValue())
+            continue;
+          if (I->mayStore() || I->isBarrier() || I->isCall() ||
+              I->hasUnmodeledSideEffects() ||
+              (I->mayLoad() && (!I->hasOneMemOperand() ||
+                                (*I->memoperands_begin())->isVolatile() ||
+                                (*I->memoperands_begin())->isAtomic())))
+            break;
+
+          DEBUG(dbgs() << "Removing no effect instruction: " << *I << '\n');
+
+          for (auto &Op : I->operands())
+            if (Op.isReg())
+              RecalcRegs.insert(Op.getReg());
+
+          auto Next = std::next(I);
+          LIS->RemoveMachineInstrFromMaps(*I);
+          I->eraseFromParent();
+          I = Next;
+
+          Changed = true;
+        }
+
+        if (I != E)
+          continue;
+
+        // Try to ascend predecessors.
+        for (auto *Pred : CurBB->predecessors())
+          if (Pred->succ_size() == 1)
+            Blocks.push_back(Pred);
+      }
+      continue;
+    }
+
+    // Try to collapse adjacent endifs.
     auto Lead = MBB.begin(), E = MBB.end();
     if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
       continue;
@@ -174,9 +229,16 @@
   }

   if (Changed) {
-    // Recompute liveness for both reg units of exec.
-    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC_LO, TRI));
-    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC_HI, TRI));
+    for (auto Reg : RecalcRegs) {
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        LIS->removeInterval(Reg);
+        if (!MRI.reg_empty(Reg))
+          LIS->createAndComputeVirtRegInterval(Reg);
+      } else {
+        for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U)
+          LIS->removeRegUnit(*U);
+      }
+    }
   }

   return Changed;
Index: test/CodeGen/AMDGPU/branch-condition-and.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-condition-and.ll
+++ test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -20,7 +20,6 @@
 ; GCN: ds_write_b32

 ; GCN: [[BB5]]
-; GCN: s_or_b64 exec, exec
 ; GCN-NEXT: s_endpgm
 ; GCN-NEXT: .Lfunc_end
 define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
Index: test/CodeGen/AMDGPU/collapse-endcf.ll
===================================================================
--- test/CodeGen/AMDGPU/collapse-endcf.ll
+++ test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -9,7 +9,6 @@
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {
 bb:
@@ -45,7 +44,6 @@
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER]]
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @uncollapsable_nested_if(i32 addrspace(1)* nocapture %arg) {
 bb:
@@ -90,7 +88,6 @@
 ; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @nested_if_if_else(i32 addrspace(1)* nocapture %arg) {
 bb:
@@ -141,13 +138,10 @@
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_THEN:s\[[0-9:]+\]]]
-; GCN-NEXT: ; mask branch [[ENDIF_INNER_OUTER_THEN:BB[0-9_]+]]
+; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN: store_dword
-; GCN-NEXT: {{^}}[[ENDIF_INNER_OUTER_THEN]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_THEN]]
 ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
 bb:
@@ -183,6 +177,33 @@
   ret void
 }

+; GCN-LABEL: {{^}}s_endpgm_unsafe_barrier:
+; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
+; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
+; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN: store_dword
+; GCN-NEXT: {{^}}[[ENDIF]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
+; GCN: s_barrier
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @s_endpgm_unsafe_barrier(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = icmp ugt i32 %tmp, 1
+  br i1 %tmp1, label %bb.then, label %bb.end
+
+bb.then:                                          ; preds = %bb
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
+  store i32 0, i32 addrspace(1)* %tmp4, align 4
+  br label %bb.end
+
+bb.end:                                           ; preds = %bb.then, %bb
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare void @llvm.amdgcn.s.barrier() #1

 attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind convergent }
Index: test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- test/CodeGen/AMDGPU/loop_break.ll
+++ test/CodeGen/AMDGPU/loop_break.ll
@@ -42,7 +42,6 @@
 ; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]

 ; GCN: ; BB#4: ; %bb9
-; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @break_loop(i32 %arg) #0 {
 bb:
Index: test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
===================================================================
--- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -86,7 +86,6 @@
 ; GCN: buffer_store_dword

 ; GCN: ; %UnifiedReturnBlock
-; GCN-NEXT: s_or_b64 exec, exec
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 entry:
Index: test/CodeGen/AMDGPU/shrink-carry.mir
===================================================================
--- test/CodeGen/AMDGPU/shrink-carry.mir
+++ test/CodeGen/AMDGPU/shrink-carry.mir
@@ -21,7 +21,6 @@
     %2 = IMPLICIT_DEF
     %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec
     %4, %5 = V_SUBBREV_U32_e64 0, %0, %3, implicit %exec
-    S_ENDPGM

 ...

@@ -46,7 +45,6 @@
     %2 = IMPLICIT_DEF
     %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec
     %4, %5 = V_SUBB_U32_e64 %0, 0, %3, implicit %exec
-    S_ENDPGM

 ...

@@ -71,7 +69,6 @@
     %2 = IMPLICIT_DEF
     %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec
     %4, %5 = V_ADDC_U32_e64 0, %0, %3, implicit %exec
-    S_ENDPGM

 ...

@@ -96,6 +93,5 @@
     %2 = IMPLICIT_DEF
     %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec
     %4, %5 = V_ADDC_U32_e64 %0, 0, %3, implicit %exec
-    S_ENDPGM

 ...
Index: test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
+++ test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
@@ -42,7 +42,6 @@
 ; GCN: s_and_saveexec_b64
 ; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: [[UNIFIED_RET]]:
-; GCN-NEXT: s_or_b64 exec, exec
 ; GCN-NEXT: s_endpgm
 ; GCN: .Lfunc_end
 define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
Index: test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
===================================================================
--- test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -10,7 +10,6 @@
 ; GCN: ; divergent unreachable

 ; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
-; GCN-NEXT: s_or_b64 exec, exec
 ; GCN: s_endpgm

 define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
@@ -37,7 +36,6 @@
 ; GCN: ; divergent unreachable

 ; GCN: [[RETURN]]:
-; GCN-NEXT: s_or_b64 exec, exec
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
 bb:
Index: test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- test/CodeGen/AMDGPU/skip-if-dead.ll
+++ test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -354,7 +354,6 @@
 ; CHECK: buffer_store_dword

 ; CHECK: [[END]]:
-; CHECK: s_or_b64 exec, exec
 ; CHECK: s_endpgm
 define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x float> %arg2) #0 {
 bb:
Index: test/CodeGen/AMDGPU/spill-empty-live-interval.mir
===================================================================
--- test/CodeGen/AMDGPU/spill-empty-live-interval.mir
+++ test/CodeGen/AMDGPU/spill-empty-live-interval.mir
@@ -35,7 +35,6 @@
     S_NOP 0, implicit %3.sub1
     S_NOP 0, implicit %0.sub1
    S_NOP 0, implicit undef %0.sub0
-    S_ENDPGM

 ...
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -330,12 +330,14 @@

 ; GCN-LABEL: {{^}}divergent_inside_uniform:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
-; GCN: [[IF_LABEL]]:
+; GCN: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN: ; mask branch [[ENDIF_LABEL]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
+; GCN: [[ENDIF_LABEL]]:
+; GCN: s_endpgm
 define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %u_cmp = icmp eq i32 %cond, 0
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -71,7 +71,6 @@
 ; SI: buffer_store_dword

 ; SI-NEXT: {{^}}[[EXIT]]:
-; SI: s_or_b64 exec, exec, [[BR_SREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -98,7 +97,6 @@
 ; SI: buffer_store_dword

 ; SI-NEXT: {{^}}[[EXIT]]:
-; SI: s_or_b64 exec, exec, [[BR_SREG]]
 ; SI: s_endpgm
 define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -137,7 +135,6 @@
 ; SI-NEXT: buffer_store_dword

 ; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
-; SI: s_or_b64 exec, exec
 ; SI: s_endpgm
 define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -230,9 +227,6 @@
 ; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

-; SI: BB#5
-; SI: s_or_b64 exec, exec, [[COND_STATE]]
-
 ; SI: [[LABEL_EXIT]]:
 ; SI-NOT: [[COND_STATE]]
 ; SI: s_endpgm
Index: test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
===================================================================
--- test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
+++ test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
@@ -8,15 +8,17 @@

 declare void @llvm.write_register.i32(metadata, i32) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #0
-
+declare void @llvm.amdgcn.wave.barrier() #2

 define amdgpu_kernel void @write_vgpr_into_sgpr() {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.write_register.i32(metadata !0, i32 %tid)
+  call void @llvm.amdgcn.wave.barrier() #2
   ret void
 }

 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
+attributes #2 = { convergent nounwind }

 !0 = !{!"exec_lo"}
Index: test/CodeGen/AMDGPU/write_register.ll
===================================================================
--- test/CodeGen/AMDGPU/write_register.ll
+++ test/CodeGen/AMDGPU/write_register.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched=0 -verify-machineinstrs < %s | FileCheck %s

 declare void @llvm.write_register.i32(metadata, i32) #0
 declare void @llvm.write_register.i64(metadata, i64) #0
@@ -8,6 +8,7 @@
   call void @llvm.write_register.i32(metadata !0, i32 0)
   call void @llvm.write_register.i32(metadata !0, i32 -1)
   call void @llvm.write_register.i32(metadata !0, i32 %val)
+  call void @llvm.amdgcn.wave.barrier() #1
   ret void
 }

@@ -19,6 +20,7 @@
   call void @llvm.write_register.i64(metadata !1, i64 0)
   call void @llvm.write_register.i64(metadata !1, i64 -1)
   call void @llvm.write_register.i64(metadata !1, i64 %val)
+  call void @llvm.amdgcn.wave.barrier() #1
   ret void
 }

@@ -30,6 +32,7 @@
   call void @llvm.write_register.i64(metadata !2, i64 0)
   call void @llvm.write_register.i64(metadata !2, i64 -1)
   call void @llvm.write_register.i64(metadata !2, i64 %val)
+  call void @llvm.amdgcn.wave.barrier() #1
   ret void
 }

@@ -39,6 +42,7 @@
 define amdgpu_kernel void @test_write_flat_scratch_lo(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !3, i32 0)
   call void @llvm.write_register.i32(metadata !3, i32 %val)
+  call void @llvm.amdgcn.wave.barrier() #1
   ret void
 }

@@ -48,6 +52,7 @@
 define amdgpu_kernel void @test_write_flat_scratch_hi(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !4, i32 0)
   call void @llvm.write_register.i32(metadata !4, i32 %val)
+  call void @llvm.amdgcn.wave.barrier() #1
   ret void
 }

@@ -57,6 +62,7 @@
 define amdgpu_kernel void @test_write_exec_lo(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !5, i32 0)
   call void @llvm.write_register.i32(metadata !5, i32 %val)
+  call void @llvm.amdgcn.wave.barrier() #1
   ret void
 }

@@ -66,10 +72,14 @@
 define amdgpu_kernel void @test_write_exec_hi(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !6, i32 0)
   call void @llvm.write_register.i32(metadata !6, i32 %val)
+  call void @llvm.amdgcn.wave.barrier() #1
   ret void
 }

+declare void @llvm.amdgcn.wave.barrier() #1
+
 attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }

 !0 = !{!"m0"}
 !1 = !{!"exec"}