Index: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -149,9 +149,19 @@ MachineOperand &ImpDefSCC = MI.getOperand(4); assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); + // If there is only one use of save exec register and that use is SI_END_CF, + // we can optimize SI_IF by returning the full saved exec mask instead of + // just cleared bits. + bool SimpleIf = false; + auto U = MRI->use_instr_nodbg_begin(SaveExecReg); + SimpleIf = U != MRI->use_instr_nodbg_end() && + std::next(U) == MRI->use_instr_nodbg_end() && + U->getOpcode() == AMDGPU::SI_END_CF; + // Add an implicit def of exec to discourage scheduling VALU after this which // will interfere with trying to form s_and_saveexec_b64 later. - unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned CopyReg = SimpleIf ? SaveExecReg + : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); MachineInstr *CopyExec = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) .addReg(AMDGPU::EXEC) @@ -166,11 +176,14 @@ .addReg(Cond.getReg()); setImpSCCDefDead(*And, true); - MachineInstr *Xor = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg) - .addReg(Tmp) - .addReg(CopyReg); - setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); + MachineInstr *Xor = nullptr; + if (!SimpleIf) { + Xor = + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg) + .addReg(Tmp) + .addReg(CopyReg); + setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); + } // Use a copy that is a terminator to get correct spill code placement it with // fast regalloc. @@ -194,7 +207,8 @@ // register. LIS->ReplaceMachineInstrInMaps(MI, *And); - LIS->InsertMachineInstrInMaps(*Xor); + if (!SimpleIf) + LIS->InsertMachineInstrInMaps(*Xor); LIS->InsertMachineInstrInMaps(*SetExec); LIS->InsertMachineInstrInMaps(*NewBr); @@ -207,7 +221,8 @@ LIS->removeInterval(SaveExecReg); LIS->createAndComputeVirtRegInterval(SaveExecReg); LIS->createAndComputeVirtRegInterval(Tmp); - LIS->createAndComputeVirtRegInterval(CopyReg); + if (!SimpleIf) + LIS->createAndComputeVirtRegInterval(CopyReg); } void SILowerControlFlow::emitElse(MachineInstr &MI) { Index: llvm/trunk/test/CodeGen/AMDGPU/branch-condition-and.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/branch-condition-and.ll +++ llvm/trunk/test/CodeGen/AMDGPU/branch-condition-and.ll @@ -14,7 +14,6 @@ ; GCN-DAG: v_cmp_lt_f32_e32 vcc, ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]] ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]] -; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]] ; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]] ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4 Index: llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll +++ llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -141,7 +141,6 @@ ; GCN: buffer_load_dword ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc -; GCN: s_xor_b64 [[SAVE]], exec, [[SAVE]] ; GCN: v_nop_e64 ; GCN: v_nop_e64 @@ -385,7 +384,6 @@ ; GCN-LABEL: {{^}}uniform_inside_divergent: ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN-NEXT: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]] @@ -436,7 +434,6 @@ ; GCN-LABEL: {{^}}analyze_mask_branch: ; GCN: v_cmp_lt_f32_e32 vcc ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN-NEXT: s_xor_b64 [[MASK]], exec, [[MASK]] ; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]] ; GCN-NEXT: s_cbranch_execz [[BRANCH_SKIP:BB[0-9]+_[0-9]+]] ; GCN-NEXT: s_branch [[LOOP_BODY:BB[0-9]+_[0-9]+]] Index: llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -20,7 +20,6 @@ ; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]] -; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}} ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] @@ -101,7 +100,6 @@ ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]] -; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}} ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill Index: llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi.ll +++ llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -4,11 +4,9 @@ ; SI-LABEL: {{^}}br_i1_phi: ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; SI: s_and_saveexec_b64 -; SI: s_xor_b64 ; SI: v_mov_b32_e32 [[REG]], -1{{$}} ; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]] ; SI: s_and_saveexec_b64 -; SI: s_xor_b64 ; SI: s_endpgm define amdgpu_kernel void @br_i1_phi(i32 %arg) { bb: Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -137,7 +137,6 @@ ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: ; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]] ; SI: buffer_load_dword [[LOAD:v[0-9]+]] ; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]] Index: llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -81,7 +81,6 @@ ; GCN-NEXT: s_or_b64 exec, exec ; GCN: v_cmp_ne_u32_e32 vcc, 0 ; GCN-NEXT: s_and_saveexec_b64 -; GCN-NEXT: s_xor_b64 ; GCN: ; %exit0 ; GCN: buffer_store_dword Index: llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll @@ -11,7 +11,6 @@ ; GCN-NEXT: ; %else ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc -; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] ; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] ; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb @@ -60,7 +59,6 @@ ; GCN: ; BB#{{[0-9]+}}: ; %else ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc -; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] ; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]] ; GCN-NEXT: ; %unreachable.bb Index: llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -3,7 +3,6 @@ ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator: ; GCN: v_cmp_eq_u32 ; GCN: s_and_saveexec_b64 -; GCN: s_xor_b64 ; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]] ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable @@ -31,7 +30,6 @@ ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order: ; GCN: v_cmp_ne_u32 ; GCN: s_and_saveexec_b64 -; GCN: s_xor_b64 ; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]] ; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable Index: llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll +++ llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -202,7 +202,6 @@ ; CHECK-LABEL: {{^}}test_kill_divergent_loop: ; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc -; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]] ; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: s_cbranch_execz [[EXIT]] @@ -337,7 +336,6 @@ ; CHECK-LABEL: {{^}}if_after_kill_block: ; CHECK: ; BB#0: ; CHECK: s_and_saveexec_b64 -; CHECK: s_xor_b64 ; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]] ; CHECK: v_cmpx_le_f32_e32 vcc, 0, @@ -347,7 +345,6 @@ ; CHECK: v_cmp_neq_f32_e32 vcc, 0, ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc -; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]] ; CHECK-NOT: branch Index: llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -10,7 +10,6 @@ ; CHECK: v_mbcnt_lo_u32_b32_e64 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc -; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; BB0_1: ; CHECK: s_load_dword s0, s[0:1], 0xa ; CHECK-NEXT: s_waitcnt lgkmcnt(0) Index: llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll +++ llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -303,7 +303,6 @@ ; GCN-LABEL: {{^}}uniform_inside_divergent: ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0 ; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]] ; GCN: s_endpgm @@ -335,7 +334,6 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { @@ -360,7 +358,6 @@ ; GCN-LABEL: {{^}}divergent_if_uniform_if: ; GCN: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc -; GCN: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] ; GCN: s_or_b64 exec, exec, [[MASK]] Index: llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -5,7 +5,6 @@ ; CHECK-LABEL: {{^}}test1: ; CHECK: v_cmp_ne_u32_e32 vcc, 0 ; CHECK: s_and_saveexec_b64 -; CHECK-NEXT: s_xor_b64 ; CHECK-NEXT: ; mask branch ; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}} ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader @@ -35,7 +34,6 @@ ; CHECK-LABEL: {{^}}test2: ; CHECK: s_and_saveexec_b64 -; CHECK-NEXT: s_xor_b64 ; CHECK-NEXT: ; mask branch ; CHECK-NEXT: s_cbranch_execz define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) { Index: llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll +++ llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll @@ -18,7 +18,6 @@ ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1 ; SI: s_and_saveexec_b64 -; SI-NEXT: s_xor_b64 ; SI-NEXT: ; mask branch ; v_mov should be after exec modification @@ -66,8 +65,7 @@ ; SI-LABEL: {{^}}simple_test_v_if: ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] -; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] +; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] ; SI-NEXT: BB{{[0-9]+_[0-9]+}}: ; SI: buffer_store_dword @@ -94,8 +92,7 @@ ; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret: ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] -; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] +; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] ; SI-NEXT: BB{{[0-9]+_[0-9]+}}: ; SI: buffer_store_dword @@ -160,8 +157,8 @@ ; SI-LABEL: {{^}}simple_test_v_loop: ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] -; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]] +; SI-NEXT: ; mask branch +; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]] ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} @@ -202,8 +199,8 @@ ; SI: buffer_load_dword [[VBOUND:v[0-9]+]] ; SI: v_cmp_lt_i32_e32 vcc ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] -; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]] +; SI-NEXT: ; mask branch +; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]] ; Initialize inner condition to false ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader