diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -54,14 +54,14 @@
 
 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
   // Match:
-  // sreg = -1
-  // vcc = S_AND_B64 exec, sreg
+  // sreg = -1 or 0
+  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
   // S_CBRANCH_VCC[N]Z
   // =>
   // S_CBRANCH_EXEC[N]Z
   // We end up with this pattern sometimes after basic block placement.
-  // It happens while combining a block which assigns -1 to a saved mask and
-  // another block which consumes that saved mask and then a branch.
+  // It happens while combining a block which assigns -1 or 0 to a saved mask
+  // and another block which consumes that saved mask and then a branch.
   bool Changed = false;
   MachineBasicBlock &MBB = *MI.getParent();
   const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
@@ -69,6 +69,7 @@
   const unsigned CondReg = TRI->getVCC();
   const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
   MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                       E = MBB.rend();
@@ -80,7 +81,8 @@
     if (A->modifiesRegister(ExecReg, TRI))
       return false;
     if (A->modifiesRegister(CondReg, TRI)) {
-      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
+      if (!A->definesRegister(CondReg, TRI) ||
+          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
@@ -97,9 +99,10 @@
   }
   if (Op1.getReg() != ExecReg)
     return Changed;
-  if (Op2.isImm() && Op2.getImm() != -1)
+  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
     return Changed;
 
+  int64_t MaskValue;
   Register SReg;
   if (Op2.isReg()) {
     SReg = Op2.getReg();
@@ -113,28 +116,72 @@
       ReadsSreg |= M->readsRegister(SReg, TRI);
     }
     if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
-        M->getOperand(1).getImm() != -1)
+        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
       return Changed;
-    // First if sreg is only used in and instruction fold the immediate
-    // into that and.
+    MaskValue = M->getOperand(1).getImm();
+    // First, if sreg is only used in the AND instruction, fold the immediate
+    // into the AND.
     if (!ReadsSreg && Op2.isKill()) {
-      A->getOperand(2).ChangeToImmediate(-1);
+      A->getOperand(2).ChangeToImmediate(MaskValue);
       M->eraseFromParent();
     }
+  } else if (Op2.isImm()) {
+    MaskValue = Op2.getImm();
   }
+  // Invert mask for s_andn2
+  if (A->getOpcode() == AndN2)
+    MaskValue = MaskValue == 0 ? -1 : 0;
+
   if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
       MI.killsRegister(CondReg, TRI))
     A->eraseFromParent();
 
   bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
   if (SReg == ExecReg) {
+    // EXEC is updated directly
     if (IsVCCZ) {
       MI.eraseFromParent();
       return true;
     }
     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
-  } else {
+  } else if (IsVCCZ && MaskValue == 0) {
+    // Will always branch
+    // Remove all successors shadowed by new unconditional branch
+    MachineBasicBlock *Parent = MI.getParent();
+    SmallVector<MachineInstr *, 4> ToRemove;
+    bool Found = false;
+    for (MachineInstr &Term : Parent->terminators()) {
+      if (Found) {
+        if (Term.isBranch())
+          ToRemove.push_back(&Term);
+      } else {
+        Found = Term.isIdenticalTo(MI);
+      }
+    }
+    assert(Found && "conditional branch is not terminator");
+    for (auto BranchMI : ToRemove) {
+      MachineOperand &Dst = BranchMI->getOperand(0);
+      assert(Dst.isMBB() && "destination is not basic block");
+      Parent->removeSuccessor(Dst.getMBB());
+      BranchMI->eraseFromParent();
+    }
+
+    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
+      Parent->removeSuccessor(Succ);
+    }
+
+    // Rewrite to unconditional branch
+    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+  } else if (!IsVCCZ && MaskValue == 0) {
+    // Will never branch
+    MachineOperand &Dst = MI.getOperand(0);
+    assert(Dst.isMBB() && "destination is not basic block");
+    MI.getParent()->removeSuccessor(Dst.getMBB());
+    MI.eraseFromParent();
+    return true;
+  } else if (MaskValue == -1) {
+    // Depends only on EXEC
     MI.setDesc(
         TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
   }
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -482,13 +482,10 @@
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
 ; GCN-NEXT: s_addc_u32
 ; GCN-NEXT: s_setpc_b64
-
 ; GCN-NEXT: [[LONG_BR_0]]:
-; GCN: s_setpc_b64
-; GCN: [[LONG_BR_DEST0]]
+; GCN: [[LONG_BR_DEST0]]:
 
-; GCN: s_cbranch_vccnz
 ; GCN-DAG: v_cmp_lt_i32
 ; GCN-DAG: v_cmp_ge_i32
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -524,7 +524,7 @@
 ; GCN: {{^; %bb.[0-9]}}:
 ; GCN: s_mov_b64 exec,
-; GCN: s_cbranch_vccnz [[BB2]]
+; GCN: s_cbranch_execnz [[BB2]]
 define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -159,7 +159,7 @@
 ; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1
 ; SI-NEXT: s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT: s_and_b64 vcc, exec, 0
-; SI-NEXT: s_cbranch_vccz BB3_2
+; SI-NEXT: s_branch BB3_2
 ; SI-NEXT: BB3_5: ; %UnifiedReturnBlock
 ; SI-NEXT: s_endpgm
 ; IR-LABEL: @infinite_loop_nest_ret(
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir b/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
--- a/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
@@ -338,3 +338,80 @@
     S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
     S_ENDPGM 0
 ...
+---
+# GCN-LABEL: name: andn2_execz_mov_vccz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_ANDN2_
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: andn2_execz_mov_vccz
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 0
+    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: andn2_branch_mov_vccz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_ANDN2_
+# GCN: S_BRANCH %bb.1
+name: andn2_branch_mov_vccz
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: andn2_execnz_mov_vccnz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_ANDN2_
+# GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+name: andn2_execnz_mov_vccnz
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 0
+    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: andn2_no_branch_mov_vccnz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_ANDN2_
+# GCN-NOT: S_CBRANCH
+# GCN-NOT: S_BRANCH
+name: andn2_no_branch_mov_vccnz
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1327,9 +1327,6 @@
 ; SI-NEXT: s_cbranch_vccz BB26_3
 ; SI-NEXT: s_branch BB26_4
 ; SI-NEXT: BB26_2:
-; SI-NEXT: s_mov_b64 s[2:3], -1
-; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; SI-NEXT: s_cbranch_vccnz BB26_4
 ; SI-NEXT: BB26_3: ; %if
 ; SI-NEXT: s_load_dword s1, s[6:7], 0x0
 ; SI-NEXT: BB26_4: ; %endif
@@ -1350,14 +1347,9 @@
 ; VI-NEXT: s_cbranch_scc0 BB26_2
 ; VI-NEXT: ; %bb.1: ; %else
 ; VI-NEXT: s_load_dword s1, s[6:7], 0x4
-; VI-NEXT: s_mov_b64 s[2:3], 0
-; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; VI-NEXT: s_cbranch_vccz BB26_3
+; VI-NEXT: s_cbranch_execz BB26_3
 ; VI-NEXT: s_branch BB26_4
 ; VI-NEXT: BB26_2:
-; VI-NEXT: s_mov_b64 s[2:3], -1
-; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; VI-NEXT: s_cbranch_vccnz BB26_4
 ; VI-NEXT: BB26_3: ; %if
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_load_dword s1, s[6:7], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -367,7 +367,6 @@
 ; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
 
 ; GCN: {{^}}[[FLOW]]:
-; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
 ; GCN: s_or_b64 exec, exec
 ; GCN: v_mov_b32_e32 v0, 2.0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -11,32 +11,27 @@
 define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 ; SI-LABEL: sgpr_if_else_salu_br:
 ; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
-; SI-NEXT: s_load_dword s2, s[0:1], 0xf
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: s_cbranch_scc0 BB0_2
-; SI-NEXT: ; %bb.1: ; %else
-; SI-NEXT: s_add_i32 s2, s7, s2
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT: s_cbranch_vccz BB0_3
-; SI-NEXT: s_branch BB0_4
-; SI-NEXT: BB0_2:
-; SI-NEXT: s_mov_b64 s[8:9], -1
-; SI-NEXT: ; implicit-def: $sgpr2
-; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT: s_cbranch_vccnz BB0_4
-; SI-NEXT: BB0_3: ; %if
-; SI-NEXT: s_sub_i32 s2, s5, s6
-; SI-NEXT: BB0_4: ; %endif
-; SI-NEXT: s_add_i32 s4, s2, s4
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: s_endpgm
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xf
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_lg_u32 s4, 0
+; SI-NEXT: s_cbranch_scc0 BB0_2
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_add_i32 s2, s7, s2
+; SI-NEXT: s_cbranch_execz BB0_3
+; SI-NEXT: s_branch BB0_4
+; SI-NEXT: BB0_2:
+; SI-NEXT: ; implicit-def: $sgpr2
+; SI-NEXT: BB0_3: ; %if
+; SI-NEXT: s_sub_i32 s2, s5, s6
+; SI-NEXT: BB0_4: ; %endif
+; SI-NEXT: s_add_i32 s4, s2, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -69,15 +64,10 @@
 ; SI-NEXT: s_load_dword s6, s[0:1], 0x37
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: s_add_i32 s3, s3, s6
-; SI-NEXT: s_mov_b64 s[6:7], 0
-; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; SI-NEXT: s_cbranch_vccz BB1_3
+; SI-NEXT: s_cbranch_execz BB1_3
 ; SI-NEXT: s_branch BB1_4
 ; SI-NEXT: BB1_2:
-; SI-NEXT: s_mov_b64 s[6:7], -1
 ; SI-NEXT: ; implicit-def: $sgpr3
-; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; SI-NEXT: s_cbranch_vccnz BB1_4
 ; SI-NEXT: BB1_3: ; %if
 ; SI-NEXT: s_load_dword s3, s[0:1], 0x1c
 ; SI-NEXT: s_load_dword s0, s[0:1], 0x25
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -668,7 +668,7 @@
 ; GCN-LABEL: {{^}}test_loop_vcc:
 ; GFX1032: v_cmp_lt_f32_e32 vcc_lo,
 ; GFX1064: v_cmp_lt_f32_e32 vcc,
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_vccz
 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
 entry:
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -652,13 +652,11 @@
 ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
 ; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
 
-; CHECK: ; %body
+; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
 ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
 
-; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %loop
+; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
 ; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
-; CHECK: s_cbranch_vccz
-
-; CHECK: s_cbranch_vccnz [[LOOPHDR]]
+; CHECK: s_cbranch_vccz [[LOOPHDR]]
 ; CHECK: ; %break
 ; CHECK: ; return