diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -372,12 +372,15 @@
   // exit" mask.
   MachineInstr *And = nullptr, *Or = nullptr;
   if (!SkipAnding) {
-    And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Dst)
+    Register AndReg = MRI->createVirtualRegister(BoolRC);
+    And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
              .addReg(Exec)
              .add(MI.getOperand(1));
     Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
-             .addReg(Dst)
+             .addReg(AndReg)
              .add(MI.getOperand(2));
+    if (LIS)
+      LIS->createAndComputeVirtRegInterval(AndReg);
   } else
     Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
              .add(MI.getOperand(1))
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -16,29 +16,28 @@
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    v_interp_p1_f32_e32 v0, v1, attr0.x
 ; CHECK-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-NEXT:    ; implicit-def: $sgpr6_sgpr7
-; CHECK-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; CHECK-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; CHECK-NEXT:    s_branch BB0_3
 ; CHECK-NEXT:  BB0_1: ; %Flow1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[10:11], 0
+; CHECK-NEXT:    s_mov_b64 s[8:9], 0
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
-; CHECK-NEXT:    s_or_b64 s[8:9], s[8:9], s[4:5]
-; CHECK-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
-; CHECK-NEXT:    s_and_b64 s[4:5], s[10:11], exec
-; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[8:9]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    s_and_b64 s[10:11], exec, s[6:7]
+; CHECK-NEXT:    s_or_b64 s[2:3], s[10:11], s[2:3]
+; CHECK-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; CHECK-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; CHECK-NEXT:    s_cbranch_execz BB0_6
 ; CHECK-NEXT:  BB0_3: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_or_b64 s[6:7], s[6:7], exec
 ; CHECK-NEXT:    s_cmp_lt_u32 s0, 32
-; CHECK-NEXT:    s_mov_b64 s[10:11], -1
+; CHECK-NEXT:    s_mov_b64 s[8:9], -1
 ; CHECK-NEXT:    s_cbranch_scc0 BB0_2
 ; CHECK-NEXT:  ; %bb.4: ; %endif1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
@@ -53,9 +52,9 @@
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, -1
 ; CHECK-NEXT:    s_branch BB0_1
 ; CHECK-NEXT:  BB0_6: ; %Flow2
-; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], s[4:5]
 ; CHECK-NEXT:    ; mask branch BB0_8
 ; CHECK-NEXT:  BB0_7: ; %if1
 ; CHECK-NEXT:    v_sqrt_f32_e32 v1, v0
@@ -63,6 +62,7 @@
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    exp mrt0 v1, v1, v1, v1 done vm
 ; CHECK-NEXT:    s_endpgm
+
 ; this is the divergent branch with the condition not marked as divergent
 start:
   %v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0)
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -3,11 +3,10 @@
 ; SI-LABEL: {{^}}i1_copy_from_loop:
 ;
-; SI: [[LOOP:BB0_[0-9]+]]: ; %Flow1
-; SI: s_or_b64 exec, exec, [[EXIT_MASK:s\[[0-9]+:[0-9]+\]]]
 ; SI: ; %Flow
+; SI: s_or_b64 [[EXIT_MASK:s\[[0-9]+:[0-9]+\]]]
 ; SI: s_and_b64 [[ACCUM_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_MASK:s\[[0-9]+:[0-9]+\]]], exec
-; SI: s_or_b64 [[I1_VALUE:s\[[0-9]+:[0-9]+\]]], s[6:7], [[ACCUM_MASK]]
+; SI: s_or_b64 [[I1_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[ACCUM_MASK]]
 ; SI: s_cbranch_execz [[FOR_END_LABEL:BB0_[0-9]+]]

 ; SI: ; %for.body
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -40,10 +40,9 @@
 ; GCN: [[FLOW]]: ; %Flow
 ; GCN: ; in Loop: Header=BB0_1 Depth=1
-; GCN: s_and_b64 [[BROKEN_MASK]], exec, [[INNER_MASK]]
-; GCN: s_or_b64 [[BROKEN_MASK]], [[BROKEN_MASK]], [[ACCUM_MASK]]
-; GCN: s_mov_b64 [[ACCUM_MASK]], [[BROKEN_MASK]]
-; GCN: s_andn2_b64 exec, exec, [[BROKEN_MASK]]
+; GCN: s_and_b64 [[AND_MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
+; GCN-NEXT: s_or_b64 [[ACCUM_MASK]], [[AND_MASK]], [[ACCUM_MASK]]
+; GCN-NEXT: s_andn2_b64 exec, exec, [[ACCUM_MASK]]
 ; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]

 ; GCN: ; %bb.4: ; %bb9
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -25,22 +25,20 @@
 ; GCN: s_mov_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}

 ; GCN: [[FLOW2:BB[0-9]+_[0-9]+]]: ; %Flow2
-; GCN: s_or_b64 exec, exec, [[TMP0:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 exec, exec, [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]]
 ; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
-; GCN: s_mov_b64 [[LEFT_OUTER]], [[TMP1]]
-; GCN: s_andn2_b64 exec, exec, [[TMP1]]
+; GCN: s_or_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[LEFT_OUTER]]
+; GCN: s_andn2_b64 exec, exec, [[LEFT_OUTER]]
 ; GCN: s_cbranch_execz [[IF_BLOCK:BB[0-9]+_[0-9]+]]

 ; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
-; GCN: s_mov_b64 [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_mov_b64 [[LEFT_INNER]], 0{{$}}

 ; GCN: ; %Flow
 ; GCN: s_or_b64 exec, exec, [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_and_b64 [[TMP0]], exec, [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 [[TMP0]], [[TMP0]], [[LEFT_INNER]]
-; GCN: s_mov_b64 [[LEFT_INNER]], [[TMP0]]
-; GCN: s_andn2_b64 exec, exec, [[TMP0]]
+; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 [[LEFT_INNER]], [[TMP0]], [[LEFT_INNER]]
+; GCN: s_andn2_b64 exec, exec, [[LEFT_INNER]]
 ; GCN: s_cbranch_execz [[FLOW2]]

 ; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
@@ -82,17 +80,17 @@
 ; OPT: llvm.amdgcn.end.cf

 ; GCN-LABEL: {{^}}multi_if_break_loop:
-; GCN: s_mov_b64 [[BROKEN_THREADS_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_mov_b64 [[SAVED_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}

 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %Flow4
-; GCN: s_and_b64 [[BROKEN_THREADS_MASK]], exec, [[BROKEN_THREADS_MASK]]
-; GCN: s_or_b64 [[BROKEN_THREADS_MASK]], [[BROKEN_THREADS_MASK]], [[SAVED:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_andn2_b64 exec, exec, [[BROKEN_THREADS_MASK]]
+; GCN: s_and_b64 [[ANDTMP0:s\[[0-9]+:[0-9]+\]]], exec, {{s\[[0-9]+:[0-9]+\]}}
+; GCN: s_or_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], [[ANDTMP0]], [[SAVED_MASK]]
+; GCN: s_and_b64 [[BROKEN_THREADS_MASK:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, exec
+; GCN: s_andn2_b64 exec, exec, [[MASK1]]
 ; GCN-NEXT: s_cbranch_execz [[LOOP_EXIT:BB[0-9]+_[0-9]+]]

 ; GCN: ; %bb1{{$}}
 ; GCN: buffer_load_dword [[LOAD0:v[0-9]+]],
-; GCN: s_mov_b64 [[SAVED]], [[BROKEN_THREADS_MASK]]

 ; GCN: ; %LeafBlock1
 ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[LOAD0]]
@@ -122,7 +120,7 @@
 ; GCN: s_branch [[LOOP]]

 ; GCN: [[LOOP_EXIT]]: ; %Flow6
-; GCN: s_or_b64 exec, exec, [[BROKEN_THREADS_MASK]]
+; GCN: s_or_b64 exec, exec, [[SAVED_MASK]]

 define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -37,9 +37,8 @@
 ; SI: ; %endif

 ; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop
-; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
 ; SI: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]]
-; SI: s_or_b64 [[LEFT]], [[TMP1]], [[TMP]]
+; SI: s_or_b64 [[LEFT]], [[TMP1]], [[LEFT]]
 ; SI: s_andn2_b64 exec, exec, [[LEFT]]
 ; SI: s_cbranch_execnz [[LOOP_LABEL]]
 ; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -223,9 +223,8 @@
 ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
 ; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
 ; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
-; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
-; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
-; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
+; SI-NEXT: s_or_b64 [[COND_STATE]], [[TMP1]], [[COND_STATE]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

 ; SI: [[LABEL_EXIT]]:
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -243,14 +243,12 @@
 ; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], [[MASK0]]
 ; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], [[MASK0]]
 ; GCN: BB{{.*}}: ; %Flow
-; GFX1032: s_and_b32 [[MASK0:s[0-9]+]], exec_lo, [[MASK1]]
-; GFX1064: s_and_b64 [[MASK0:s\[[0-9:]+\]]], exec, [[MASK1]]
-; GFX1032: s_or_b32 [[MASK0]], [[MASK0]], [[ACC:s[0-9]+]]
-; GFX1064: s_or_b64 [[MASK0]], [[MASK0]], [[ACC:s\[[0-9:]+\]]]
-; GFX1032: s_mov_b32 [[ACC]], [[MASK0]]
-; GFX1064: s_mov_b64 [[ACC]], [[MASK0]]
-; GFX1032: s_andn2_b32 exec_lo, exec_lo, [[MASK0]]
-; GFX1064: s_andn2_b64 exec, exec, [[MASK0]]
+; GFX1032: s_and_b32 [[TMP0:s[0-9]+]], exec_lo, [[MASK1]]
+; GFX1064: s_and_b64 [[TMP0:s\[[0-9:]+\]]], exec, [[MASK1]]
+; GFX1032: s_or_b32 [[ACC:s[0-9]+]], [[TMP0]], [[ACC]]
+; GFX1064: s_or_b64 [[ACC:s\[[0-9:]+\]]], [[TMP0]], [[ACC]]
+; GFX1032: s_andn2_b32 exec_lo, exec_lo, [[ACC]]
+; GFX1064: s_andn2_b64 exec, exec, [[ACC]]
 ; GCN: s_cbranch_execz
 ; GCN: BB{{.*}}:
 ; GCN: s_load_dword [[LOAD:s[0-9]+]]