diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1874,20 +1874,36 @@
   (V_EXP_F32_e64 SRCMODS.NONE, (V_MUL_LEGACY_F32_e64 $src1_mods, $src1, SRCMODS.NONE, (V_LOG_F32_e64 $src0_mods, $src0), 0, 0))
 >;
 
+// Uniform sext of an i1 held in SCC. S_CSELECT_B32 yields its first source
+// when SCC is set, so the "true" value (-1) must be the first operand.
+def : GCNPat <
+  (i32 (UniformUnaryFrag<sext> SCC)),
+  (S_CSELECT_B32 (i32 -1), (i32 0))
+>;
+
 def : GCNPat <
   (i32 (sext i1:$src0)),
   (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                      /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0)
 >;
 
-class Ext32Pat <SDNode ext> : GCNPat <
-  (i32 (ext i1:$src0)),
-  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
-                     /*src1mod*/(i32 0), /*src1*/(i32 1), $src0)
->;
+// Zero/any-extension of an i1 to i32: S_CSELECT_B32 when the operand is the
+// uniform SCC bit, the generic V_CNDMASK_B32 pattern otherwise.
+multiclass Ext32Pat <SDNode ext> {
+  // SCC set -> 1, SCC clear -> 0 (the first operand is the SCC-set value).
+  def : GCNPat <
+    (i32 (UniformUnaryFrag<ext> SCC)),
+    (S_CSELECT_B32 (i32 1), (i32 0))
+  >;
+  def : GCNPat <
+    (i32 (ext i1:$src0)),
+    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                       /*src1mod*/(i32 0), /*src1*/(i32 1), $src0)
+  >;
+}
 
-def : Ext32Pat <zext>;
-def : Ext32Pat <anyext>;
+defm : Ext32Pat <zext>;
+defm : Ext32Pat <anyext>;
 
 // The multiplication scales from [0,1) to the unsigned integer range,
 // rounding down a bit to avoid unwanted overflow.
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1081,12 +1081,21 @@
 >;
 }
 
-class ZExt_i16_i1_Pat <SDNode ext> : GCNPat <
-  (i16 (ext i1:$src)),
-  (V_CNDMASK_B32_e64 (i32 0/*src0mod*/), (i32 0/*src0*/),
-                     (i32 0/*src1mod*/), (i32 1/*src1*/),
-                     $src)
->;
+// Zero/any-extension of an i1 to i16: S_CSELECT_B32 when the operand is the
+// uniform SCC bit, the generic V_CNDMASK_B32 pattern otherwise.
+multiclass ZExt_i16_i1_Pat <SDNode ext> {
+  // SCC set -> 1, SCC clear -> 0 (the first operand is the SCC-set value).
+  def : GCNPat <
+    (i16 (UniformUnaryFrag<ext> SCC)),
+    (S_CSELECT_B32 (i32 1), (i32 0))
+  >;
+  def : GCNPat <
+    (i16 (ext i1:$src)),
+    (V_CNDMASK_B32_e64 (i32 0/*src0mod*/), (i32 0/*src0*/),
+                       (i32 0/*src1mod*/), (i32 1/*src1*/),
+                       $src)
+  >;
+}
 
 foreach vt = [i16, v2i16] in {
 def : GCNPat <
@@ -1135,8 +1144,15 @@
 
 let Predicates = [Has16BitInsts] in {
 
-def : ZExt_i16_i1_Pat<zext>;
-def : ZExt_i16_i1_Pat<anyext>;
+defm : ZExt_i16_i1_Pat<zext>;
+defm : ZExt_i16_i1_Pat<anyext>;
+
+// Uniform sext of the SCC bit to i16: SCC set -> -1, SCC clear -> 0
+// (S_CSELECT_B32 returns its first operand when SCC is set).
+def : GCNPat <
+  (i16 (UniformUnaryFrag<sext> SCC)),
+  (S_CSELECT_B32 (i32 -1), (i32 0))
+>;
 
 def : GCNPat <
   (i16 (sext i1:$src)),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -52,8 +52,10 @@ ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s1, s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cselect_b32 s2, 0, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s1, -1 +; GCN-NEXT: s_xor_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s1, s1, 1 ; GCN-NEXT: s_cmp_lg_u32 s1, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB3_2 @@ -91,9 +93,11 @@ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s0, s1 -; GCN-NEXT: s_xor_b32 s1, s1, -1 -; GCN-NEXT: s_and_b32 s1, s1, 1 ; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cselect_b32 s2, 0, 1 +; GCN-NEXT: s_xor_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s1, s1, 1 ; 
GCN-NEXT: s_cmp_lg_u32 s1, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir @@ -292,7 +292,9 @@ ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: $sgpr0 = COPY [[COPY]] + ; GCN-NEXT: $scc = COPY [[COPY]] + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 0, 1, implicit $scc + ; GCN-NEXT: $sgpr0 = COPY [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sreg_32(s1) = G_TRUNC %0 %2:sgpr(s32) = G_ANYEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir @@ -231,8 +231,9 @@ ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_BFE_I32_:%[0-9]+]]:sreg_32 = S_BFE_I32 [[COPY]], 65536, implicit-def $scc - ; GCN-NEXT: $sgpr0 = COPY [[S_BFE_I32_]] + ; GCN-NEXT: $scc = COPY [[COPY]] + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 0, -1, implicit $scc + ; GCN-NEXT: $sgpr0 = COPY [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sreg_32(s1) = G_TRUNC %0 %2:sgpr(s32) = G_SEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir @@ -231,8 +231,9 @@ ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], 1, implicit-def $scc - ; GCN-NEXT: $sgpr0 = COPY 
[[S_AND_B32_]] + ; GCN-NEXT: $scc = COPY [[COPY]] + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 0, 1, implicit $scc + ; GCN-NEXT: $sgpr0 = COPY [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sreg_32(s1) = G_TRUNC %0 %2:sgpr(s32) = G_ZEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -9,8 +9,10 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s0, -1 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cselect_b32 s2, 0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_xor_b32 s1, s1, -1 +; GFX9-NEXT: s_xor_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 @@ -97,8 +99,10 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s0, -1 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cselect_b32 s2, 0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_xor_b32 s1, s1, -1 +; GFX9-NEXT: s_xor_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -15,8 +15,8 @@ ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-NEXT: s_cselect_b32 s4, 0, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; @@ -28,10 +28,9 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: 
v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_cselect_b32 s4, 0, 1 +; GFX8-NEXT: s_andn2_b32 s4, 1, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -43,10 +42,9 @@ ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_eq_u32 s2, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9-NEXT: s_andn2_b32 s0, 1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: @@ -155,7 +153,9 @@ ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN-NEXT: s_and_b64 s[0:1], vcc, exec +; GCN-NEXT: s_cselect_b32 s0, 0, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; @@ -168,7 +168,9 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: s_and_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_cselect_b32 s0, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -182,7 +184,9 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0 ; GFX9-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: s_and_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -41,8 +41,10 @@ ; GFX7-NEXT: .LBB0_4: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_wqm_b64 s[4:5], -1 -; GFX7-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX7-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, 0, 1 +; GFX7-NEXT: s_cmp_lg_u32 s4, 1 +; GFX7-NEXT: s_cbranch_scc1 .LBB0_6 ; GFX7-NEXT: ; %bb.5: ; %if ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -75,8 +77,10 @@ ; GFX89-NEXT: .LBB0_4: ; %Flow ; GFX89-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX89-NEXT: s_wqm_b64 s[4:5], -1 -; GFX89-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX89-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX89-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX89-NEXT: s_cselect_b32 s4, 0, 1 +; GFX89-NEXT: s_cmp_lg_u32 s4, 1 +; GFX89-NEXT: s_cbranch_scc1 .LBB0_6 ; GFX89-NEXT: ; %bb.5: ; %if ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX89-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -110,8 +114,10 @@ ; GFX1064-NEXT: .LBB0_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX1064-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, 0, 1 +; GFX1064-NEXT: s_cmp_lg_u32 s4, 1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB0_6 ; GFX1064-NEXT: ; %bb.5: ; %if ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -144,8 +150,10 @@ ; GFX1032-NEXT: .LBB0_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s4, 0, 1 +; 
GFX1032-NEXT: s_cmp_lg_u32 s4, 1 +; GFX1032-NEXT: s_cbranch_scc1 .LBB0_6 ; GFX1032-NEXT: ; %bb.5: ; %if ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -182,9 +190,11 @@ ; GFX1164-NEXT: .LBB0_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5] -; GFX1164-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, 0, 1 +; GFX1164-NEXT: s_cmp_lg_u32 s4, 1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB0_6 ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -221,9 +231,11 @@ ; GFX1132-NEXT: .LBB0_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-NEXT: s_wqm_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s4, 0, 1 +; GFX1132-NEXT: s_cmp_lg_u32 s4, 1 +; GFX1132-NEXT: s_cbranch_scc1 .LBB0_6 ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -248,8 +260,10 @@ ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_wqm_b64 s[8:9], -1 ; GFX7-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX7-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GFX7-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX7-NEXT: s_and_b64 s[4:5], s[8:9], exec +; GFX7-NEXT: s_cselect_b32 s4, 0, 1 +; GFX7-NEXT: s_cmp_lg_u32 s4, 1 +; GFX7-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 @@ -305,8 +319,10 @@ ; GFX8-NEXT: .LBB1_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_wqm_b64 s[4:5], -1 -; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX8-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, 0, 1 +; GFX8-NEXT: s_cmp_lg_u32 s4, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB1_6 ; GFX8-NEXT: ; %bb.5: ; %if ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX8-NEXT: .LBB1_6: ; %UnifiedReturnBlock @@ -361,8 +377,10 @@ ; GFX9-NEXT: .LBB1_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_wqm_b64 s[4:5], -1 -; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX9-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, 0, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB1_6 ; GFX9-NEXT: ; %bb.5: ; %if ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX9-NEXT: .LBB1_6: ; %UnifiedReturnBlock @@ -424,8 +442,10 @@ ; GFX1064-NEXT: .LBB1_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX1064-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, 0, 1 +; GFX1064-NEXT: s_cmp_lg_u32 s4, 1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_6 ; GFX1064-NEXT: ; %bb.5: ; %if ; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; GFX1064-NEXT: .LBB1_6: ; %UnifiedReturnBlock @@ -478,8 +498,10 @@ ; GFX1032-NEXT: .LBB1_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s4, 0, 1 +; GFX1032-NEXT: s_cmp_lg_u32 s4, 1 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_6 ; GFX1032-NEXT: ; %bb.5: ; %if ; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; GFX1032-NEXT: .LBB1_6: ; 
%UnifiedReturnBlock @@ -550,9 +572,11 @@ ; GFX1164-NEXT: .LBB1_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5] -; GFX1164-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, 0, 1 +; GFX1164-NEXT: s_cmp_lg_u32 s4, 1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_6 ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v4, off, s[0:3], 0 ; GFX1164-NEXT: .LBB1_6: ; %UnifiedReturnBlock @@ -613,9 +637,11 @@ ; GFX1132-NEXT: .LBB1_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-NEXT: s_wqm_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s4, 0, 1 +; GFX1132-NEXT: s_cmp_lg_u32 s4, 1 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_6 ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v4, off, s[0:3], 0 ; GFX1132-NEXT: .LBB1_6: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1496,7 +1496,7 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: s_cmp_lg_u32 s5, 0 -; SI-NEXT: s_cbranch_scc0 .LBB14_4 +; SI-NEXT: s_cbranch_scc0 .LBB14_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 @@ -1504,22 +1504,26 @@ ; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: s_cbranch_execnz .LBB14_3 -; SI-NEXT: .LBB14_2: ; 
%if +; SI-NEXT: s_branch .LBB14_3 +; SI-NEXT: .LBB14_2: +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: .LBB14_3: ; %Flow +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_cselect_b32 s2, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s2, 1 +; SI-NEXT: s_cbranch_scc1 .LBB14_5 +; SI-NEXT: ; %bb.4: ; %if ; SI-NEXT: s_and_b32 s2, s4, 0xffff ; SI-NEXT: s_bcnt1_i32_b32 s2, s2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: .LBB14_3: ; %endif +; SI-NEXT: .LBB14_5: ; %endif ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB14_4: -; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_branch .LBB14_2 ; ; VI-LABEL: ctpop_i16_in_br: ; VI: ; %bb.0: ; %entry @@ -1529,7 +1533,7 @@ ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_cmp_ne_u16_e64 s[6:7], s5, 0 ; VI-NEXT: s_and_b64 vcc, exec, s[6:7] -; VI-NEXT: s_cbranch_vccz .LBB14_4 +; VI-NEXT: s_cbranch_vccz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -1537,22 +1541,26 @@ ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 ; VI-NEXT: s_mov_b64 s[2:3], 0 -; VI-NEXT: s_cbranch_execnz .LBB14_3 -; VI-NEXT: .LBB14_2: ; %if +; VI-NEXT: s_branch .LBB14_3 +; VI-NEXT: .LBB14_2: +; VI-NEXT: s_mov_b64 s[2:3], -1 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: .LBB14_3: ; %Flow +; VI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; VI-NEXT: s_cselect_b32 s2, 0, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 1 +; VI-NEXT: s_cbranch_scc1 .LBB14_5 +; VI-NEXT: ; %bb.4: ; %if ; VI-NEXT: s_and_b32 s2, s4, 0xffff ; VI-NEXT: s_bcnt1_i32_b32 s2, s2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: .LBB14_3: ; %endif +; VI-NEXT: .LBB14_5: ; %endif ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_short v0, off, 
s[0:3], 0 ; VI-NEXT: s_endpgm -; VI-NEXT: .LBB14_4: -; VI-NEXT: s_mov_b64 s[2:3], -1 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: s_branch .LBB14_2 ; ; EG-LABEL: ctpop_i16_in_br: ; EG: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll @@ -20,23 +20,23 @@ ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; CHECK-NEXT: %20:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: %1:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, %20, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: %2:vgpr_32 = contract reassoc nofpexcept V_ADD_F32_e64 0, %1, 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_ADD_F32_e64 0, [[V_FMAC_F32_e64_1]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.bb11: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %1, %bb.1 - ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %2, %bb.1 - ; CHECK-NEXT: 
[[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_MOV_B32_1]], %bb.0, [[S_MOV_B32_2]], %bb.1 - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI2]], implicit $exec + ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_FMAC_F32_e64_1]], %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_ADD_F32_e64_]], %bb.1 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, [[S_MOV_B32_2]], %bb.1 + ; CHECK-NEXT: $scc = COPY [[PHI2]] + ; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 0, 1, implicit $scc ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]] - ; CHECK-NEXT: S_CMP_LG_U32 killed [[COPY1]], killed [[S_MOV_B32_3]], implicit-def $scc - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, killed [[COPY2]], implicit-def dead $scc + ; CHECK-NEXT: S_CMP_LG_U32 killed [[S_CSELECT_B32_]], killed [[S_MOV_B32_3]], implicit-def $scc + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, killed [[COPY1]], implicit-def dead $scc ; CHECK-NEXT: $vcc_lo = COPY [[S_AND_B32_1]] ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.4, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.3 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll @@ -54,7 +54,7 @@ ; GCN-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 2, [[S_LOAD_DWORD_IMM]], implicit-def dead $scc ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_1]], 0, implicit-def $scc ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $scc - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]] + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[COPY2]] ; GCN-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc @@ -62,24 +62,28 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1.out.true: ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[COPY3]], killed [[S_MOV_B64_]], implicit-def dead $scc + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[COPY3]], killed [[S_MOV_B64_]], implicit-def dead $scc ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_3]], %subreg.sub2, killed [[S_MOV_B32_2]], %subreg.sub3 - ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) + ; GCN-NEXT: $scc = COPY [[S_XOR_B64_]] + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 0, 1, implicit $scc + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_]] + ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[COPY6]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.out.else: - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0 ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = 
S_MOV_B32 61440 ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[S_MOV_B32_5]], %subreg.sub2, killed [[S_MOV_B32_4]], %subreg.sub3 - ; GCN-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[COPY3]], implicit $exec - ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_1]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_5]], %subreg.sub2, killed [[S_MOV_B32_4]], %subreg.sub3 + ; GCN-NEXT: $scc = COPY [[COPY3]] + ; GCN-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 0, 1, implicit $scc + ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_1]] + ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 entry: %0 = and i32 %x, 2 diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll @@ -21,9 +21,11 @@ ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_2]], implicit-def $scc ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $scc - ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY4]], killed [[COPY3]], implicit-def dead $scc - ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec - ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, 
implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) + ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[COPY4]], killed [[COPY3]], implicit-def dead $scc + ; GCN-NEXT: $scc = COPY [[S_OR_B64_]] + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 0, 1, implicit $scc + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_]] + ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %setcc = icmp slt i16 %x, 0 %select = select i1 %setcc, i1 true, i1 %z @@ -73,9 +75,11 @@ ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_CMP_LT_I32 killed [[COPY3]], killed [[S_MOV_B32_2]], implicit-def $scc ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $scc - ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY6]], killed [[COPY5]], implicit-def dead $scc - ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec - ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) + ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[COPY6]], killed [[COPY5]], implicit-def dead $scc + ; GCN-NEXT: $scc = COPY [[S_OR_B64_]] + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 0, 1, implicit $scc + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_]] + ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %setcc = icmp slt i32 %x, 0 %select = select i1 %setcc, i1 true, i1 %z @@ -128,9 +132,11 @@ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GCN-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]] ; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 
= V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY8]], implicit $exec - ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc - ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec - ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1) + ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc + ; GCN-NEXT: $scc = COPY [[S_OR_B64_]] + ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 0, 1, implicit $scc + ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_]] + ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %setcc = icmp slt i64 %x, 0 %select = select i1 %setcc, i1 true, i1 %z diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -9,26 +9,23 @@ ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: s_add_u32 s24, s24, s7 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 -; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 8 ; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 16 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 24 ; CHECK-NEXT: 
s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; CHECK-NEXT: s_xor_b64 s[2:3], s[6:7], -1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 -; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s1, 8 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[14:15] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 -; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_and_b64 s[4:5], exec, s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s1, 8 +; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 +; CHECK-NEXT: s_and_b64 s[0:1], exec, s[2:3] +; CHECK-NEXT: s_and_b64 s[2:3], exec, s[8:9] ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[18:19], 0 @@ -38,90 +35,105 @@ ; CHECK-NEXT: .LBB0_2: ; %Flow7 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_12 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_13 ; CHECK-NEXT: .LBB0_3: ; %bb7 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 +; CHECK-NEXT: s_and_b64 s[16:17], s[10:11], exec +; CHECK-NEXT: s_cselect_b32 s16, 0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s16, 1 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.4: ; %bb8 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[2:3] +; CHECK-NEXT: s_mov_b64 vcc, s[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 ; CHECK-NEXT: s_mov_b64 s[18:19], -1 ; CHECK-NEXT: s_mov_b64 s[22:23], s[8:9] -; CHECK-NEXT: s_cbranch_execz .LBB0_7 -; CHECK-NEXT: s_branch .LBB0_8 +; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_6: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], -1 ; CHECK-NEXT: s_mov_b64 s[18:19], 0 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; 
CHECK-NEXT: .LBB0_7: ; %bb10 +; CHECK-NEXT: .LBB0_7: ; %Flow8 +; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: s_and_b64 s[20:21], s[16:17], exec +; CHECK-NEXT: s_cselect_b32 s20, 0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s20, 1 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_9 +; CHECK-NEXT: ; %bb.8: ; %bb10 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[18:19], -1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[22:23], s[12:13] -; CHECK-NEXT: .LBB0_8: ; %Flow9 +; CHECK-NEXT: s_mov_b64 s[22:23], s[14:15] +; CHECK-NEXT: .LBB0_9: ; %Flow9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: s_and_b64 s[22:23], s[22:23], exec +; CHECK-NEXT: s_cselect_b32 s22, 0, 1 ; CHECK-NEXT: s_mov_b64 s[20:21], -1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; CHECK-NEXT: s_cmp_lg_u32 s22, 1 ; CHECK-NEXT: s_mov_b64 s[22:23], -1 -; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 -; CHECK-NEXT: ; %bb.9: ; %bb13 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 +; CHECK-NEXT: ; %bb.10: ; %bb13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[4:5] -; CHECK-NEXT: s_cbranch_vccz .LBB0_11 -; CHECK-NEXT: ; %bb.10: ; %bb16 +; CHECK-NEXT: s_mov_b64 vcc, s[2:3] +; CHECK-NEXT: s_cbranch_vccz .LBB0_12 +; CHECK-NEXT: ; %bb.11: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 ; CHECK-NEXT: s_mov_b64 s[20:21], -1 -; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] +; CHECK-NEXT: s_mov_b64 s[22:23], s[12:13] ; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] ; CHECK-NEXT: s_branch .LBB0_2 -; CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 ; CHECK-NEXT: ; implicit-def: $sgpr16_sgpr17 ; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] ; CHECK-NEXT: s_branch .LBB0_2 -; CHECK-NEXT: .LBB0_12: ; %loop.exit.guard6 +; CHECK-NEXT: .LBB0_13: ; %loop.exit.guard6 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; 
CHECK-NEXT: s_xor_b64 s[14:15], s[20:21], -1 +; CHECK-NEXT: s_xor_b64 s[22:23], s[20:21], -1 ; CHECK-NEXT: s_mov_b64 s[20:21], -1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[14:15] -; CHECK-NEXT: s_cbranch_vccz .LBB0_16 -; CHECK-NEXT: ; %bb.13: ; %bb14 +; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] +; CHECK-NEXT: s_cbranch_vccz .LBB0_17 +; CHECK-NEXT: ; %bb.14: ; %bb14 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_15 -; CHECK-NEXT: ; %bb.14: ; %bb15 +; CHECK-NEXT: s_and_b64 s[20:21], s[4:5], exec +; CHECK-NEXT: s_cselect_b32 s20, 0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s20, 1 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_16 +; CHECK-NEXT: ; %bb.15: ; %bb15 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:4 -; CHECK-NEXT: buffer_store_dword v1, off, s[24:27], 0 -; CHECK-NEXT: .LBB0_15: ; %Flow +; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 +; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 +; CHECK-NEXT: .LBB0_16: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 -; CHECK-NEXT: .LBB0_16: ; %Flow13 +; CHECK-NEXT: .LBB0_17: ; %Flow13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[20:21] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_3 -; CHECK-NEXT: ; %bb.17: ; %loop.exit.guard +; CHECK-NEXT: s_and_b64 s[20:21], s[20:21], exec +; CHECK-NEXT: s_cselect_b32 s20, 0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s20, 1 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_3 +; CHECK-NEXT: ; %bb.18: ; %loop.exit.guard ; CHECK-NEXT: s_and_b64 vcc, exec, s[16:17] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_22 -; CHECK-NEXT: ; %bb.18: ; %loop.exit.guard5 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_23 +; CHECK-NEXT: ; %bb.19: ; %loop.exit.guard5 ; CHECK-NEXT: s_and_b64 vcc, exec, s[18:19] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_22 -; CHECK-NEXT: ; %bb.19: ; %bb17 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_23 +; CHECK-NEXT: ; 
%bb.20: ; %bb17 ; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] -; CHECK-NEXT: s_cbranch_vccz .LBB0_21 -; CHECK-NEXT: ; %bb.20: ; %bb19 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_cbranch_vccz .LBB0_22 -; CHECK-NEXT: .LBB0_21: ; %bb21 +; CHECK-NEXT: ; %bb.21: ; %bb19 +; CHECK-NEXT: s_and_b64 s[0:1], s[4:5], exec +; CHECK-NEXT: s_cselect_b32 s0, 0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 1 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_23 +; CHECK-NEXT: .LBB0_22: ; %bb21 ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB0_22: ; %UnifiedUnreachableBlock +; CHECK-NEXT: .LBB0_23: ; %UnifiedUnreachableBlock bb: br label %bb6 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -95,17 +95,18 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s0, s2, s2 -; GFX7-NEXT: s_cmp_lt_u32 s0, s2 +; GFX7-NEXT: s_add_i32 s0, s4, s4 +; GFX7-NEXT: s_cmp_lt_u32 s0, s4 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX7-NEXT: s_and_b64 s[2:3], s[0:1], exec +; GFX7-NEXT: s_cselect_b32 s2, 0, 1 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_cmp_lg_u32 s0, 0 -; GFX7-NEXT: s_addc_u32 s0, s2, 0 -; GFX7-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0 -; GFX7-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX7-NEXT: s_addc_u32 s0, s4, 0 +; GFX7-NEXT: s_cmp_ge_u32 s0, s2 +; GFX7-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX7-NEXT: ; %bb.1: ; %bb0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -122,16 +123,17 @@ ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 
0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s2, s2 -; GFX9-NEXT: s_cmp_lt_u32 s0, s2 +; GFX9-NEXT: s_add_i32 s0, s4, s4 +; GFX9-NEXT: s_cmp_lt_u32 s0, s4 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s2, 0, 1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: s_addc_u32 s0, s2, 0 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0 -; GFX9-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX9-NEXT: s_addc_u32 s0, s4, 0 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 +; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -153,11 +155,12 @@ ; GFX10-NEXT: s_add_i32 s1, s0, s0 ; GFX10-NEXT: s_cmp_lt_u32 s1, s0 ; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX10-NEXT: s_and_b32 s2, s1, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, 0, 1 ; GFX10-NEXT: s_cmpk_lg_u32 s1, 0x0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 -; GFX10-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0 -; GFX10-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX10-NEXT: s_cmp_ge_u32 s0, s2 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -180,12 +183,13 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lt_u32 s1, s0 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX11-NEXT: s_and_b32 s2, s1, exec_lo +; GFX11-NEXT: s_cselect_b32 s2, 0, 1 ; GFX11-NEXT: s_cmpk_lg_u32 s1, 0x0 ; GFX11-NEXT: s_addc_u32 s0, s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0 -; GFX11-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_ge_u32 s0, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 9 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -32,15 +32,19 @@ ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB0_3 -; SI-NEXT: s_branch .LBB0_4 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB0_3 ; SI-NEXT: .LBB0_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB0_3: ; %T +; SI-NEXT: .LBB0_3: ; %Flow +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cbranch_scc1 .LBB0_5 +; SI-NEXT: ; %bb.4: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 @@ -65,7 +69,7 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: .LBB0_4: ; %exit +; SI-NEXT: .LBB0_5: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 @@ -94,14 +98,20 @@ ; GFX9-NEXT: ; %bb.1: ; %F ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-NEXT: s_branch .LBB0_4 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB0_3 ; GFX9-NEXT: .LBB0_2: +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 -; GFX9-NEXT: .LBB0_3: ; %T +; GFX9-NEXT: .LBB0_3: ; %Flow +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, 0, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB0_5 +; GFX9-NEXT: ; 
%bb.4: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB0_4: ; %exit +; GFX9-NEXT: .LBB0_5: ; %exit ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 @@ -162,16 +172,20 @@ ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB1_3 -; SI-NEXT: s_branch .LBB1_4 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB1_3 ; SI-NEXT: .LBB1_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB1_3: ; %T +; SI-NEXT: .LBB1_3: ; %Flow +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cbranch_scc1 .LBB1_5 +; SI-NEXT: ; %bb.4: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 @@ -196,7 +210,7 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v0 ; SI-NEXT: v_or_b32_e32 v5, v5, v1 -; SI-NEXT: .LBB1_4: ; %exit +; SI-NEXT: .LBB1_5: ; %exit ; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 @@ -226,14 +240,20 @@ ; GFX9-NEXT: ; %bb.1: ; %F ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-NEXT: s_branch .LBB1_4 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB1_3 ; GFX9-NEXT: .LBB1_2: +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 -; GFX9-NEXT: .LBB1_3: ; %T +; GFX9-NEXT: .LBB1_3: ; %Flow +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, 0, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB1_5 +; 
GFX9-NEXT: ; %bb.4: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB1_4: ; %exit +; GFX9-NEXT: .LBB1_5: ; %exit ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 @@ -297,15 +317,19 @@ ; SI-NEXT: v_or_b32_e32 v4, v4, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB2_3 -; SI-NEXT: s_branch .LBB2_4 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB2_3 ; SI-NEXT: .LBB2_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB2_3: ; %T +; SI-NEXT: .LBB2_3: ; %Flow +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cbranch_scc1 .LBB2_5 +; SI-NEXT: ; %bb.4: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 @@ -333,7 +357,7 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: .LBB2_4: ; %exit +; SI-NEXT: .LBB2_5: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -358,14 +382,20 @@ ; GFX9-NEXT: ; %bb.1: ; %F ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_cbranch_execz .LBB2_3 -; GFX9-NEXT: s_branch .LBB2_4 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB2_3 ; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 -; GFX9-NEXT: .LBB2_3: ; %T +; GFX9-NEXT: .LBB2_3: ; %Flow +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, 0, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX9-NEXT: ; %bb.4: ; %T 
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB2_4: ; %exit +; GFX9-NEXT: .LBB2_5: ; %exit ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v0, v3, v3, s4 @@ -448,15 +478,19 @@ ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB3_3 -; SI-NEXT: s_branch .LBB3_4 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB3_3 ; SI-NEXT: .LBB3_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB3_3: ; %T +; SI-NEXT: .LBB3_3: ; %Flow +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cbranch_scc1 .LBB3_5 +; SI-NEXT: ; %bb.4: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 @@ -497,7 +531,7 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: .LBB3_4: ; %exit +; SI-NEXT: .LBB3_5: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 @@ -529,18 +563,24 @@ ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX9-NEXT: s_cbranch_execz .LBB3_3 -; GFX9-NEXT: s_branch .LBB3_4 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB3_3 ; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GFX9-NEXT: .LBB3_3: ; %T +; GFX9-NEXT: .LBB3_3: ; %Flow +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, 0, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX9-NEXT: ; 
%bb.4: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX9-NEXT: .LBB3_4: ; %exit +; GFX9-NEXT: .LBB3_5: ; %exit ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 @@ -618,16 +658,20 @@ ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v7, v2 ; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB4_3 -; SI-NEXT: s_branch .LBB4_4 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB4_3 ; SI-NEXT: .LBB4_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB4_3: ; %T +; SI-NEXT: .LBB4_3: ; %Flow +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cbranch_scc1 .LBB4_5 +; SI-NEXT: ; %bb.4: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 @@ -668,7 +712,7 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: .LBB4_4: ; %exit +; SI-NEXT: .LBB4_5: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 @@ -701,18 +745,24 @@ ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX9-NEXT: s_cbranch_execz .LBB4_3 -; GFX9-NEXT: s_branch .LBB4_4 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB4_3 ; GFX9-NEXT: .LBB4_2: +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: 
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GFX9-NEXT: .LBB4_3: ; %T +; GFX9-NEXT: .LBB4_3: ; %Flow +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, 0, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB4_5 +; GFX9-NEXT: ; %bb.4: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX9-NEXT: .LBB4_4: ; %exit +; GFX9-NEXT: .LBB4_5: ; %exit ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 @@ -793,15 +843,19 @@ ; SI-NEXT: v_or_b32_e32 v4, v4, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB5_3 -; SI-NEXT: s_branch .LBB5_4 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB5_3 ; SI-NEXT: .LBB5_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB5_3: ; %T +; SI-NEXT: .LBB5_3: ; %Flow +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cbranch_scc1 .LBB5_5 +; SI-NEXT: ; %bb.4: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 @@ -845,7 +899,7 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: .LBB5_4: ; %exit +; SI-NEXT: .LBB5_5: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -873,18 +927,24 @@ ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX9-NEXT: 
s_cbranch_execz .LBB5_3 -; GFX9-NEXT: s_branch .LBB5_4 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB5_3 ; GFX9-NEXT: .LBB5_2: +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GFX9-NEXT: .LBB5_3: ; %T +; GFX9-NEXT: .LBB5_3: ; %Flow +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, 0, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_5 +; GFX9-NEXT: ; %bb.4: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX9-NEXT: .LBB5_4: ; %exit +; GFX9-NEXT: .LBB5_5: ; %exit ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v0, v5, v5, s4 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -740,13 +740,15 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s4 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s2 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -758,7 +760,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s4 -; VI-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; VI-NEXT: s_cselect_b32 s4, 0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -790,13 +794,15 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s4| -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s2| +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -808,7 +814,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s4| -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; VI-NEXT: s_cselect_b32 s4, 0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -622,13 +622,15 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s4 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cmp_eq_f32_e64 
s[4:5], 1.0, s2 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -640,7 +642,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s4 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; VI-NEXT: s_cselect_b32 s4, 0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -672,13 +676,15 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s4| -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s2| +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -690,7 +696,9 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s4| -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; VI-NEXT: s_cselect_b32 s4, 0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -168,8 +168,8 @@ ; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4 
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V3-NEXT: s_cselect_b32 s0, 0, 1 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V3-NEXT: flat_store_dword v[0:1], v0 ; GFX8V3-NEXT: s_waitcnt vmcnt(0) ; GFX8V3-NEXT: s_endpgm @@ -180,8 +180,8 @@ ; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V4-NEXT: s_cselect_b32 s0, 0, 1 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: s_endpgm @@ -192,8 +192,8 @@ ; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V5-NEXT: s_cselect_b32 s0, 0, 1 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: flat_store_dword v[0:1], v0 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm @@ -205,8 +205,8 @@ ; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V3-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off ; GFX9V3-NEXT: s_waitcnt vmcnt(0) ; GFX9V3-NEXT: s_endpgm @@ -218,8 +218,8 @@ ; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V4-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off ; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: s_endpgm @@ -231,8 +231,8 @@ ; 
GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V5-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: s_endpgm @@ -249,8 +249,8 @@ ; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V3-NEXT: s_cselect_b32 s0, 0, 1 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V3-NEXT: flat_store_dword v[0:1], v0 ; GFX8V3-NEXT: s_waitcnt vmcnt(0) ; GFX8V3-NEXT: s_endpgm @@ -261,8 +261,8 @@ ; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V4-NEXT: s_cselect_b32 s0, 0, 1 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: s_endpgm @@ -273,8 +273,8 @@ ; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V5-NEXT: s_cselect_b32 s0, 0, 1 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: flat_store_dword v[0:1], v0 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm @@ -286,8 +286,8 @@ ; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V3-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off ; GFX9V3-NEXT: s_waitcnt vmcnt(0) ; 
GFX9V3-NEXT: s_endpgm @@ -299,8 +299,8 @@ ; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V4-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off ; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: s_endpgm @@ -312,8 +312,8 @@ ; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V5-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1417,17 +1417,22 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s6, 0 -; SI-NEXT: s_cbranch_scc0 .LBB30_4 +; SI-NEXT: s_cbranch_scc0 .LBB30_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dword s7, s[2:3], 0x1 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_branch .LBB30_3 +; SI-NEXT: .LBB30_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: .LBB30_3: ; %Flow +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cbranch_scc1 .LBB30_5 +; SI-NEXT: ; %bb.4: ; %if ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB30_3 -; SI-NEXT: .LBB30_2: ; %if ; SI-NEXT: s_load_dword s7, s[2:3], 0x0 -; SI-NEXT: .LBB30_3: ; %endif +; SI-NEXT: .LBB30_5: ; %endif ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s3, 
0x100f000 @@ -1435,8 +1440,6 @@ ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB30_4: -; SI-NEXT: s_branch .LBB30_2 ; ; VI-LABEL: insert_split_bb: ; VI: ; %bb.0: ; %entry @@ -1444,14 +1447,22 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s6, 0 -; VI-NEXT: s_cbranch_scc0 .LBB30_4 +; VI-NEXT: s_cbranch_scc0 .LBB30_2 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_load_dword s7, s[2:3], 0x4 -; VI-NEXT: s_cbranch_execnz .LBB30_3 -; VI-NEXT: .LBB30_2: ; %if +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB30_3 +; VI-NEXT: .LBB30_2: +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: .LBB30_3: ; %Flow +; VI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; VI-NEXT: s_cselect_b32 s4, 0, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cbranch_scc1 .LBB30_5 +; VI-NEXT: ; %bb.4: ; %if ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s7, s[2:3], 0x0 -; VI-NEXT: .LBB30_3: ; %endif +; VI-NEXT: .LBB30_5: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 @@ -1459,8 +1470,6 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm -; VI-NEXT: .LBB30_4: -; VI-NEXT: s_branch .LBB30_2 entry: %0 = insertelement <2 x i32> undef, i32 %a, i32 0 %1 = icmp eq i32 %a, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -58,23 +58,23 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s3, 56 ; GCN-NEXT: s_mov_b64 s[2:3], -1 -; GCN-NEXT: s_cbranch_scc1 .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN-NEXT: s_cbranch_vccz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %.exit -; GCN-NEXT: s_endpgm -; GCN-NEXT: .LBB2_3: ; %.one +; GCN-NEXT: 
s_cbranch_scc0 .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[2:3], 0 -; GCN-NEXT: s_cbranch_execnz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %.zero +; GCN-NEXT: .LBB2_2: ; %Flow +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s2, 0, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cbranch_scc1 .LBB2_4 +; GCN-NEXT: ; %bb.3: ; %.zero ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: .LBB2_4: ; %.exit ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -474,8 +474,8 @@ ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s4, 1 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-NEXT: s_cselect_b32 s4, 0, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm entry: @@ -517,8 +517,8 @@ ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s4, 1 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-NEXT: s_cselect_b32 s4, 0, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -137,17 +137,19 @@ ; SI-NEXT: s_cmp_lt_i32 s9, 0 ; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 ; SI-NEXT: s_cmp_lt_i32 
s12, s8 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_xor_b64 s[8:9], s[10:11], s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9] +; SI-NEXT: s_and_b64 s[0:1], s[8:9], exec +; SI-NEXT: s_cselect_b32 s0, 0, 1 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -162,13 +164,15 @@ ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_cmp_lt_i32 s4, s0 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; VI-NEXT: s_cselect_b32 s0, 0, 1 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dword v[0:1], v4 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -183,7 +187,7 @@ ; GFX9-NEXT: v_add_i32 v1, s2, v1 clamp ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, 1, 0, vcc ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm @@ -199,7 +203,7 @@ ; GFX10-NEXT: s_add_i32 s0, s2, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, 1, 0, vcc_lo ; GFX10-NEXT: global_store_dword v1, v2, s[4:5] ; GFX10-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-NEXT: s_endpgm @@ -215,7 +219,7 @@ ; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, 1, 0, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] ; GFX11-NEXT: global_store_b8 v1, v0, s[2:3] @@ -255,7 +259,9 @@ ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0 ; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cselect_b32 s0, 0, 1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -279,8 +285,10 @@ ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5 ; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; VI-NEXT: s_cselect_b32 s0, 0, 1 ; VI-NEXT: flat_store_dword v[0:1], v6 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -296,7 +304,7 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, 1, 0, vcc ; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -312,7 +320,7 @@ ; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1, 0, vcc_lo ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: global_store_byte v0, v2, s[2:3] ; GFX10-NEXT: s_endpgm @@ -330,7 +338,7 @@ ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1, 0, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] @@ -360,17 +368,19 @@ ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0 ; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; SI-NEXT: s_cselect_b32 s4, 0, 1 ; SI-NEXT: s_mov_b32 s0, s2 ; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -380,19 +390,21 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_addc_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[3:4] +; VI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] -; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 +; VI-NEXT: s_xor_b64 s[2:3], s[4:5], vcc ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; VI-NEXT: s_and_b64 s[0:1], s[2:3], exec +; VI-NEXT: s_cselect_b32 s0, 0, 1 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_byte v[2:3], v0 ; 
VI-NEXT: s_endpgm ; @@ -409,9 +421,11 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_xor_b64 s[4:5], s[10:11], vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: global_store_byte v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -422,12 +436,14 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s8, s4, s6 ; GFX10-NEXT: s_addc_u32 s9, s5, s7 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] ; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, s[6:7], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_xor_b32 s4, s6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: s_xor_b32 s4, s5, s4 +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s4, 0, 1 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: global_store_byte v2, v3, s[2:3] ; GFX10-NEXT: s_endpgm @@ -438,13 +454,16 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s8, s4, s6 ; GFX11-NEXT: s_addc_u32 s9, s5, s7 -; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] -; GFX11-NEXT: v_mov_b32_e32 v0, s8 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s4, s6, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: v_cmp_lt_i64_e64 s5, s[6:7], 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s4, s5, s4 +; GFX11-NEXT: 
s_and_b32 s4, s4, exec_lo +; GFX11-NEXT: s_cselect_b32 s4, 0, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] @@ -486,7 +505,9 @@ ; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 ; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cselect_b32 s0, 0, 1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -511,7 +532,9 @@ ; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] ; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; VI-NEXT: s_cselect_b32 s0, 0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_byte v[6:7], v0 ; VI-NEXT: s_endpgm ; @@ -529,7 +552,9 @@ ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] ; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: global_store_byte v6, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -544,10 +569,12 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: s_xor_b32 s0, s0, vcc_lo 
+; GFX10-NEXT: s_and_b32 s0, s0, exec_lo +; GFX10-NEXT: s_cselect_b32 s0, 0, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] ; GFX10-NEXT: global_store_byte v6, v0, s[6:7] ; GFX10-NEXT: s_endpgm @@ -563,12 +590,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, 0, 1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5] ; GFX11-NEXT: global_store_b8 v6, v0, s[6:7] @@ -608,14 +637,18 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 ; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 +; SI-NEXT: s_xor_b64 s[2:3], vcc, s[2:3] ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cselect_b32 s4, 0, 1 +; SI-NEXT: s_and_b64 s[0:1], 
s[2:3], exec +; SI-NEXT: s_cselect_b32 s0, 0, 1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: s_endpgm @@ -637,14 +670,18 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3 ; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 -; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 ; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 +; VI-NEXT: s_xor_b64 s[2:3], vcc, s[2:3] ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; VI-NEXT: s_cselect_b32 s4, 0, 1 +; VI-NEXT: s_and_b64 s[0:1], s[2:3], exec +; VI-NEXT: s_cselect_b32 s0, 0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] ; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_endpgm @@ -662,9 +699,9 @@ ; GFX9-NEXT: v_add_u32_e32 v4, v0, v2 ; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, 1, 0, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1, 0, vcc ; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -683,9 +720,9 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v3, v0, v2 ; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1, 0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 -; GFX10-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, 1, 0, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm @@ -705,9 +742,9 @@ ; GFX11-NEXT: v_add_nc_i32 v0, v0, v2 clamp ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1, 0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, 1, 0, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1] ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -142,7 +142,6 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 ; GCN-IR-NEXT: s_mov_b32 s1, s0 @@ -174,17 +173,22 @@ ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 ; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 ; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] +; GCN-IR-NEXT: s_and_b64 s[20:21], s[20:21], exec +; GCN-IR-NEXT: s_cselect_b32 s15, 0, 1 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 +; GCN-IR-NEXT: s_cmp_lg_u32 s15, 1 +; GCN-IR-NEXT: s_mov_b32 s15, 0 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s20, s16, 1 ; GCN-IR-NEXT: s_addc_u32 s21, s17, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[20:21], 0 -; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: 
s_lshl_b64 s[10:11], s[12:13], s16 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], exec +; GCN-IR-NEXT: s_cselect_b32 s17, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s16 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10 +; GCN-IR-NEXT: s_cmp_lg_u32 s17, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s20 ; GCN-IR-NEXT: s_add_u32 s19, s6, -1 @@ -971,7 +975,6 @@ ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 @@ -1011,17 +1014,22 @@ ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 ; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 ; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] +; GCN-IR-NEXT: s_and_b64 s[20:21], s[20:21], exec +; GCN-IR-NEXT: s_cselect_b32 s15, 0, 1 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 +; GCN-IR-NEXT: s_cmp_lg_u32 s15, 1 +; GCN-IR-NEXT: s_mov_b32 s15, 0 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s20, s16, 1 ; GCN-IR-NEXT: s_addc_u32 s21, s17, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[20:21], 0 -; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16 -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], exec +; GCN-IR-NEXT: s_cselect_b32 s17, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s16 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10 +; GCN-IR-NEXT: s_cmp_lg_u32 s17, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s20 ; GCN-IR-NEXT: s_add_u32 s19, s6, -1 @@ -1216,17 +1224,21 @@ ; 
GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec ; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, 1 +; GCN-IR-NEXT: s_cmp_lg_u32 s9, 1 ; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s11, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s11 -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 +; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s12 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s8 +; GCN-IR-NEXT: s_cmp_lg_u32 s11, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB10_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s14 ; GCN-IR-NEXT: s_add_u32 s16, s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll b/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll @@ -16,7 +16,7 @@ ; CHECK-NEXT: ds_read_b32 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; CHECK-NEXT: v_cndmask_b32_e64 v1, 1, 0, vcc_lo ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -15,13 +15,22 @@ ; SI-NEXT: s_load_dword s2, s[0:1], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_cbranch_scc0 .LBB0_4 +; SI-NEXT: s_cbranch_scc0 .LBB0_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_add_i32 s7, s7, s2 -; SI-NEXT: s_cbranch_execnz .LBB0_3 -; SI-NEXT: .LBB0_2: ; %if +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB0_3 +; SI-NEXT: .LBB0_2: +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: .LBB0_3: ; %Flow +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_cselect_b32 s2, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s2, 1 +; SI-NEXT: s_cbranch_scc1 .LBB0_5 +; SI-NEXT: ; %bb.4: ; %if ; SI-NEXT: s_sub_i32 s7, s5, s6 -; SI-NEXT: .LBB0_3: ; %endif +; SI-NEXT: .LBB0_5: ; %endif ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_add_i32 s4, s7, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -30,9 +39,6 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB0_4: -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: s_branch .LBB0_2 entry: %0 = icmp eq i32 %a, 0 @@ -59,19 +65,28 @@ ; SI-NEXT: s_load_dword s4, s[0:1], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: s_cbranch_scc0 .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dword s2, s[0:1], 0x2e ; SI-NEXT: s_load_dword s3, s[0:1], 0x37 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s5, s2, s3 -; SI-NEXT: s_cbranch_execnz .LBB1_3 -; SI-NEXT: .LBB1_2: ; %if +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB1_3 +; SI-NEXT: .LBB1_2: +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: .LBB1_3: ; %Flow +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_cselect_b32 s2, 0, 1 +; SI-NEXT: s_cmp_lg_u32 s2, 1 +; SI-NEXT: s_cbranch_scc1 .LBB1_5 +; SI-NEXT: ; %bb.4: ; %if ; SI-NEXT: s_load_dword s2, s[0:1], 0x1c ; SI-NEXT: s_load_dword s3, s[0:1], 0x25 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s5, s2, s3 -; SI-NEXT: .LBB1_3: ; %endif +; SI-NEXT: .LBB1_5: ; %endif ; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_add_i32 s4, s5, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -80,9 +95,6 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB1_4: -; SI-NEXT: ; implicit-def: $sgpr5 -; SI-NEXT: s_branch .LBB1_2 entry: %cmp0 = icmp eq i32 %a, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -11,9 +11,9 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, s3 ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_cselect_b32 s0, 0, -1 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -25,9 +25,9 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, s3 ; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_cselect_b32 s0, 0, -1 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b @@ -227,9 +227,9 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, s3 ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_cselect_b32 s0, 0, -1 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -241,9 +241,9 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, s3 ; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_cselect_b32 s0, 0, -1 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; 
VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b @@ -269,7 +269,9 @@ ; SI-NEXT: s_cmp_eq_u32 s6, s7 ; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 0, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -285,7 +287,9 @@ ; VI-NEXT: s_cmp_eq_u32 s6, s7 ; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; VI-NEXT: s_cselect_b32 s4, 0, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp0 = icmp eq i32 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -145,18 +145,22 @@ ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, 1 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GCN-IR-NEXT: s_cmp_lg_u32 s11, 1 ; GCN-IR-NEXT: s_mov_b32 s11, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 +; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_cselect_b32 s13, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s12 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_cmp_lg_u32 s13, 1 +; GCN-IR-NEXT: 
s_cbranch_scc0 .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16 ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 @@ -1013,7 +1017,6 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 31 @@ -1047,17 +1050,22 @@ ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] +; GCN-IR-NEXT: s_and_b64 s[18:19], s[18:19], exec +; GCN-IR-NEXT: s_cselect_b32 s13, 0, 1 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 +; GCN-IR-NEXT: s_cmp_lg_u32 s13, 1 +; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s18, s14, 1 ; GCN-IR-NEXT: s_addc_u32 s19, s15, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14 -; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], exec +; GCN-IR-NEXT: s_cselect_b32 s15, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s14 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GCN-IR-NEXT: s_cmp_lg_u32 s15, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s18 ; GCN-IR-NEXT: s_add_u32 s18, s8, -1 @@ -1158,7 +1166,6 @@ ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 @@ -1198,17 +1205,22 @@ ; 
GCN-IR-NEXT: s_cselect_b32 s11, 0, s5 ; GCN-IR-NEXT: s_cselect_b32 s10, 0, s4 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] +; GCN-IR-NEXT: s_and_b64 s[18:19], s[18:19], exec +; GCN-IR-NEXT: s_cselect_b32 s13, 0, 1 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 +; GCN-IR-NEXT: s_cmp_lg_u32 s13, 1 +; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s18, s14, 1 ; GCN-IR-NEXT: s_addc_u32 s19, s15, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s14 -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], exec +; GCN-IR-NEXT: s_cselect_b32 s15, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s10, 63, s14 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GCN-IR-NEXT: s_cmp_lg_u32 s15, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[4:5], s18 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 @@ -1406,17 +1418,21 @@ ; GCN-IR-NEXT: s_and_b64 s[10:11], s[12:13], exec ; GCN-IR-NEXT: s_cselect_b32 s10, 0, 24 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, 1 +; GCN-IR-NEXT: s_cmp_lg_u32 s9, 1 ; GCN-IR-NEXT: s_mov_b32 s11, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s2, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s3, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, 1 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] ; GCN-IR-NEXT: s_lshl_b64 s[2:3], 24, 
s2 -; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 +; GCN-IR-NEXT: s_cmp_lg_u32 s9, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB10_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10 ; GCN-IR-NEXT: s_add_u32 s14, s4, -1 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -30,68 +30,66 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, i32 %tmp5.i.i, i32 %tmp427.i, i1 %tmp438.i, double %tmp27.i, i1 %tmp48.i) { ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb -; GLOBALNESS1-NEXT: s_mov_b64 s[54:55], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] +; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, v0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v44, 0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dword v[0:1], v44, off +; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v0, v44, s[36:37] +; GLOBALNESS1-NEXT: global_load_dword v0, v42, s[76:77] ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS1-NEXT: s_mov_b64 s[64:65], s[4:5] -; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 -; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s38, 0 -; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[36:37], 
s[4:5], v[44:45] -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 -; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[94:95], s[4:5], -1 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 +; GLOBALNESS1-NEXT: s_nop 0 +; GLOBALNESS1-NEXT: s_load_dword s8, s[8:9], 0x20 +; GLOBALNESS1-NEXT: s_cselect_b64 s[82:83], -1, 0 +; GLOBALNESS1-NEXT: s_xor_b64 s[84:85], s[82:83], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[88:89], s[4:5], -1 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_getpc_b64 s[6:7] -; GLOBALNESS1-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 -; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_xor_b64 s[86:87], s[4:5], -1 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 -; GLOBALNESS1-NEXT: s_mov_b32 s98, s16 -; GLOBALNESS1-NEXT: s_mov_b64 s[62:63], s[8:9] -; GLOBALNESS1-NEXT: s_mov_b32 s99, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s100, s14 +; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GLOBALNESS1-NEXT: s_xor_b64 s[86:87], s[6:7], -1 +; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s8, 0 +; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GLOBALNESS1-NEXT: s_xor_b64 s[88:89], s[6:7], -1 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, 0x40994400 +; GLOBALNESS1-NEXT: s_mov_b32 s52, s16 +; GLOBALNESS1-NEXT: s_mov_b32 s53, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s54, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS1-NEXT: s_mov_b64 s[92:93], 0x80 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS1-NEXT: s_mov_b32 s69, 0x3ff00000 +; GLOBALNESS1-NEXT: s_mov_b64 s[80:81], 0x80 +; 
GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], v[42:43] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[100:101], s[4:5], 0 +; GLOBALNESS1-NEXT: s_mov_b32 s57, 0x3ff00000 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s4, 0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s5, 1 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s4, 2 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s5, 3 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s4, 4 -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v41, s5, 5 +; GLOBALNESS1-NEXT: v_readfirstlane_b32 s6, v0 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s6, 0 +; GLOBALNESS1-NEXT: s_cselect_b64 s[90:91], -1, 0 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s6, 1 +; GLOBALNESS1-NEXT: s_cselect_b64 s[92:93], -1, 0 +; GLOBALNESS1-NEXT: s_cmp_eq_u32 s6, 1 +; GLOBALNESS1-NEXT: s_cselect_b64 s[94:95], -1, 0 +; GLOBALNESS1-NEXT: s_cmp_eq_u32 s6, 0 +; GLOBALNESS1-NEXT: s_cselect_b64 s[96:97], -1, 0 +; GLOBALNESS1-NEXT: s_getpc_b64 s[6:7] +; GLOBALNESS1-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 +; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 +; GLOBALNESS1-NEXT: s_load_dwordx2 s[98:99], s[6:7], 0x0 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v41, 4 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v41, 5 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 +; GLOBALNESS1-NEXT: s_and_b64 s[6:7], 
s[96:97], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s6, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s6, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_30 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow6 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] @@ -132,352 +130,368 @@ ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a34, v2 ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a33, v1 ; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a32, v0 -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_31 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 -; GLOBALNESS1-NEXT: ; Child Loop BB1_15 Depth 2 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] +; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[80:81], s[80:81] op_sel:[0,1] ; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1] -; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 -; GLOBALNESS1-NEXT: buffer_store_dword v44, off, s[0:3], 0 -; GLOBALNESS1-NEXT: flat_load_dword v43, v[0:1] -; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 +; GLOBALNESS1-NEXT: flat_load_dword v44, v[0:1] +; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s54 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s53 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s52 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS1-NEXT: 
s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[98:99] +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[82:83], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_8 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s39, 1 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s79, 1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock3 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s39, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 1 +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS1-NEXT: s_cbranch_execnz .LBB1_8 -; GLOBALNESS1-NEXT: s_branch .LBB1_23 +; GLOBALNESS1-NEXT: s_branch .LBB1_8 ; GLOBALNESS1-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], -1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: s_branch .LBB1_23 -; GLOBALNESS1-NEXT: .LBB1_8: ; %Flow16 +; GLOBALNESS1-NEXT: .LBB1_8: ; %Flow17 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s8, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s8, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_24 +; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 -; GLOBALNESS1-NEXT: .LBB1_9: ; %baz.exit.i +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 +; GLOBALNESS1-NEXT: .LBB1_10: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 
v[0:1], 0, 0 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[0:1] -; GLOBALNESS1-NEXT: s_mov_b32 s68, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s70, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s71, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s72, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s73, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s74, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s75, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s76, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s77, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s78, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s79, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s80, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s81, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s82, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s83, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s84, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s85, s69 +; GLOBALNESS1-NEXT: s_mov_b32 s56, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s58, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s59, s57 +; GLOBALNESS1-NEXT: s_mov_b32 s60, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s61, s57 +; GLOBALNESS1-NEXT: s_mov_b32 s62, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s63, s57 +; GLOBALNESS1-NEXT: s_mov_b32 s64, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s65, s57 +; GLOBALNESS1-NEXT: s_mov_b32 s66, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s67, s57 +; GLOBALNESS1-NEXT: s_mov_b32 s68, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s69, s57 +; GLOBALNESS1-NEXT: s_mov_b32 s70, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s71, s57 +; GLOBALNESS1-NEXT: s_mov_b32 s72, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s73, s57 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[96:97], 0, v0 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1] -; GLOBALNESS1-NEXT: 
v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[70:71], s[96:97] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 -; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v0 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[56:57], s[56:57] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], s[58:59], s[58:59] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[4:5], s[60:61], s[60:61] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[6:7], s[62:63], s[62:63] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[8:9], s[64:65], s[64:65] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[10:11], s[66:67], s[66:67] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[12:13], s[68:69], s[68:69] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[14:15], s[70:71], s[70:71] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[16:17], s[72:73], s[72:73] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[18:19], s[74:75], s[74:75] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[20:21], s[76:77], s[76:77] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[22:23], s[78:79], s[78:79] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[24:25], s[80:81], s[80:81] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[26:27], s[82:83], s[82:83] op_sel:[0,1] +; 
GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[84:85], s[84:85] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[86:87], s[86:87] op_sel:[0,1] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[58:59], s[44:45] +; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_27 +; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v41, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v41, 1 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12 -; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[90:91], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_13 +; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[44:45], off -; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc -; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[42:43] -; GLOBALNESS1-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 -; GLOBALNESS1-NEXT: s_branch .LBB1_15 -; GLOBALNESS1-NEXT: .LBB1_13: ; %Flow7 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[46:47], 0, v[0:1] +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[48:49], 0, v2 +; GLOBALNESS1-NEXT: s_branch 
.LBB1_16 +; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow7 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS1-NEXT: .LBB1_14: ; %bb63.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[86:87] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 -; GLOBALNESS1-NEXT: .LBB1_15: ; %bb44.i +; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[88:89], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_26 +; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[94:95] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 -; GLOBALNESS1-NEXT: ; %bb.16: ; %bb46.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[88:89] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 -; GLOBALNESS1-NEXT: ; %bb.17: ; %bb50.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 -; GLOBALNESS1-NEXT: ; %bb.18: ; %bb3.i.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[40:41] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 -; GLOBALNESS1-NEXT: ; %bb.19: ; %bb6.i.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[56:57] -; GLOBALNESS1-NEXT: .LBB1_20: ; %spam.exit.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[90:91] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 -; GLOBALNESS1-NEXT: ; %bb.21: ; %bb55.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s60, s62, 40 -; GLOBALNESS1-NEXT: 
s_addc_u32 s61, s63, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[84:85], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_15 +; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[86:87], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_15 +; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[42:43], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_21 +; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[100:101], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_21 +; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[46:47], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[92:93], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_15 +; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS1-NEXT: s_add_u32 s50, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s51, s39, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] +; 
GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[50:51] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] +; GLOBALNESS1-NEXT: s_mov_b32 s12, s54 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s53 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s52 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[98:99] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[50:51] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s54 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s53 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s52 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[0:1], a[32:33], off -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 -; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[98:99] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[48:49] +; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 +; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[0:1], v[44:45], 
off -; GLOBALNESS1-NEXT: s_branch .LBB1_13 -; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock +; GLOBALNESS1-NEXT: global_store_dwordx2 v[0:1], v[42:43], off +; GLOBALNESS1-NEXT: s_branch .LBB1_14 +; GLOBALNESS1-NEXT: .LBB1_24: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s39, 0 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 -; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_10 +; GLOBALNESS1-NEXT: .LBB1_25: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GLOBALNESS1-NEXT: s_branch .LBB1_3 -; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow14 +; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow14 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b32 s36, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s37, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s38, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s39, s93 -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[40:41] -; GLOBALNESS1-NEXT: s_mov_b32 s40, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s41, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s42, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s43, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s44, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s45, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s46, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s47, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s48, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s49, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s50, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s51, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s52, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s53, s93 
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[4:5], s[40:41], s[40:41] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[6:7], s[42:43], s[42:43] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[8:9], s[44:45], s[44:45] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[10:11], s[46:47], s[46:47] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[12:13], s[48:49], s[48:49] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[14:15], s[50:51], s[50:51] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[16:17], s[52:53], s[52:53] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[18:19], s[54:55], s[54:55] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[20:21], s[56:57], s[56:57] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[22:23], s[58:59], s[58:59] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[24:25], s[60:61], s[60:61] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] -; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[6:7] -; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[4:5] -; GLOBALNESS1-NEXT: s_mov_b32 s39, s75 -; GLOBALNESS1-NEXT: s_mov_b64 s[42:43], s[72:73] -; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow15 +; GLOBALNESS1-NEXT: s_mov_b32 s60, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s61, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s62, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s63, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s64, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s65, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s66, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s67, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s68, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s69, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s70, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s71, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s72, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s73, s81 +; 
GLOBALNESS1-NEXT: s_mov_b32 s74, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s75, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s76, s81 +; GLOBALNESS1-NEXT: s_mov_b32 s77, s81 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], s[62:63], s[62:63] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[4:5], s[64:65], s[64:65] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[6:7], s[66:67], s[66:67] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[8:9], s[68:69], s[68:69] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[10:11], s[70:71], s[70:71] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[12:13], s[72:73], s[72:73] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[14:15], s[74:75], s[74:75] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[16:17], s[76:77], s[76:77] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[18:19], s[78:79], s[78:79] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[20:21], s[80:81], s[80:81] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[22:23], s[82:83], s[82:83] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[24:25], s[84:85], s[84:85] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[26:27], s[86:87], s[86:87] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[88:89], s[88:89] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[90:91], s[90:91] op_sel:[0,1] +; GLOBALNESS1-NEXT: .LBB1_27: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[70:71] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[58:59] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[44:45] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 -; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i +; GLOBALNESS1-NEXT: ; %bb.28: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v41, 2 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v41, 3 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; 
GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 -; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i +; GLOBALNESS1-NEXT: s_and_b64 s[6:7], s[94:95], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s6, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s6, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_1 +; GLOBALNESS1-NEXT: ; %bb.29: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 -; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i +; GLOBALNESS1-NEXT: .LBB1_30: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 -; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GLOBALNESS1-NEXT: .LBB1_31: ; %loop.exit.guard +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_32 -; GLOBALNESS1-NEXT: ; %bb.31: ; %bb7.i.i -; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_33 +; GLOBALNESS1-NEXT: ; %bb.32: ; %bb7.i.i +; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 
s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s54 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s53 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s52 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS1-NEXT: .LBB1_32: ; %Flow -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_34 -; GLOBALNESS1-NEXT: ; %bb.33: ; %bb11.i.i -; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS1-NEXT: .LBB1_33: ; %Flow +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GLOBALNESS1-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_35 +; GLOBALNESS1-NEXT: ; %bb.34: ; %bb11.i.i +; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s54 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s53 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s52 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; 
GLOBALNESS1-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock +; GLOBALNESS1-NEXT: .LBB1_35: ; %UnifiedUnreachableBlock ; ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb -; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] +; GLOBALNESS0-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, v0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v44, 0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dword v[0:1], v44, off +; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v0, v44, s[36:37] +; GLOBALNESS0-NEXT: global_load_dword v0, v42, s[76:77] ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS0-NEXT: s_mov_b64 s[62:63], s[4:5] -; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 -; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, 0x40994400 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s38, 0 -; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[44:45] -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 -; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[94:95], s[4:5], -1 +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s78, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 +; GLOBALNESS0-NEXT: s_nop 0 +; GLOBALNESS0-NEXT: s_load_dword s8, s[8:9], 0x20 +; GLOBALNESS0-NEXT: s_cselect_b64 s[82:83], -1, 0 +; GLOBALNESS0-NEXT: s_xor_b64 s[84:85], 
s[82:83], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[88:89], s[4:5], -1 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_getpc_b64 s[6:7] -; GLOBALNESS0-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 -; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 -; GLOBALNESS0-NEXT: s_xor_b64 s[86:87], s[4:5], -1 -; GLOBALNESS0-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 -; GLOBALNESS0-NEXT: s_mov_b32 s98, s16 -; GLOBALNESS0-NEXT: s_mov_b64 s[60:61], s[8:9] -; GLOBALNESS0-NEXT: s_mov_b32 s99, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s100, s14 +; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GLOBALNESS0-NEXT: s_xor_b64 s[86:87], s[6:7], -1 +; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s8, 0 +; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GLOBALNESS0-NEXT: s_xor_b64 s[88:89], s[6:7], -1 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, 0x40994400 +; GLOBALNESS0-NEXT: s_mov_b32 s50, s16 +; GLOBALNESS0-NEXT: s_mov_b32 s51, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s52, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS0-NEXT: s_mov_b64 s[92:93], 0x80 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS0-NEXT: s_mov_b32 s69, 0x3ff00000 +; GLOBALNESS0-NEXT: s_mov_b64 s[80:81], 0x80 +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], v[42:43] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[100:101], s[4:5], 0 +; GLOBALNESS0-NEXT: s_mov_b32 s57, 0x3ff00000 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 
0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 1 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 2 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 3 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 4 -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 5 +; GLOBALNESS0-NEXT: v_readfirstlane_b32 s6, v0 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s6, 0 +; GLOBALNESS0-NEXT: s_cselect_b64 s[90:91], -1, 0 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s6, 1 +; GLOBALNESS0-NEXT: s_cselect_b64 s[92:93], -1, 0 +; GLOBALNESS0-NEXT: s_cmp_eq_u32 s6, 1 +; GLOBALNESS0-NEXT: s_cselect_b64 s[94:95], -1, 0 +; GLOBALNESS0-NEXT: s_cmp_eq_u32 s6, 0 +; GLOBALNESS0-NEXT: s_cselect_b64 s[96:97], -1, 0 +; GLOBALNESS0-NEXT: s_getpc_b64 s[6:7] +; GLOBALNESS0-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 +; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 +; GLOBALNESS0-NEXT: s_load_dwordx2 s[98:99], s[6:7], 0x0 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 5 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 +; GLOBALNESS0-NEXT: s_and_b64 s[6:7], s[96:97], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s6, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s6, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc0 .LBB1_30 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] @@ -518,287 +532,305 @@ ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a34, v2 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a33, v1 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a32, v0 -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_31 ; 
GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 -; GLOBALNESS0-NEXT: ; Child Loop BB1_15 Depth 2 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] +; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[80:81], s[80:81] op_sel:[0,1] ; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1] -; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40 -; GLOBALNESS0-NEXT: buffer_store_dword v44, off, s[0:3], 0 -; GLOBALNESS0-NEXT: flat_load_dword v43, v[0:1] -; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 +; GLOBALNESS0-NEXT: flat_load_dword v44, v[0:1] +; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s52 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s51 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s50 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[98:99] +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[82:83], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_8 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lt_i32 s39, 1 +; 
GLOBALNESS0-NEXT: s_cmp_lt_i32 s79, 1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock3 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s39, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s79, 1 +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS0-NEXT: s_cbranch_execnz .LBB1_8 -; GLOBALNESS0-NEXT: s_branch .LBB1_23 +; GLOBALNESS0-NEXT: s_branch .LBB1_8 ; GLOBALNESS0-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], -1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: s_branch .LBB1_23 -; GLOBALNESS0-NEXT: .LBB1_8: ; %Flow16 +; GLOBALNESS0-NEXT: .LBB1_8: ; %Flow17 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s8, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s8, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc0 .LBB1_24 +; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 -; GLOBALNESS0-NEXT: .LBB1_9: ; %baz.exit.i +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 +; GLOBALNESS0-NEXT: .LBB1_10: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[0:1] -; GLOBALNESS0-NEXT: s_mov_b32 s68, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s70, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s71, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s72, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s73, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s74, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s75, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s76, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s77, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s78, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s79, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s80, s93 
-; GLOBALNESS0-NEXT: s_mov_b32 s81, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s82, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s83, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s84, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s85, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s56, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s58, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s59, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s60, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s61, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s62, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s63, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s64, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s65, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s66, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s67, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s68, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s69, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s70, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s71, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s72, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s73, s57 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[96:97], 0, v0 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 
v[26:27], s[94:95], s[94:95] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[70:71], s[96:97] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 -; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v0 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[56:57], s[56:57] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[58:59], s[58:59] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[60:61], s[60:61] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[62:63], s[62:63] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[8:9], s[64:65], s[64:65] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[10:11], s[66:67], s[66:67] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[12:13], s[68:69], s[68:69] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[14:15], s[70:71], s[70:71] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[16:17], s[72:73], s[72:73] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[18:19], s[74:75], s[74:75] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[20:21], s[76:77], s[76:77] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[22:23], s[78:79], s[78:79] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[24:25], s[80:81], s[80:81] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[82:83], s[82:83] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[84:85], s[84:85] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[86:87], s[86:87] op_sel:[0,1] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[58:59], s[44:45] +; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_27 +; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 1 
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12 -; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[90:91], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_13 +; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[44:45], off -; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc -; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[42:43] -; GLOBALNESS0-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 -; GLOBALNESS0-NEXT: s_branch .LBB1_15 -; GLOBALNESS0-NEXT: .LBB1_13: ; %Flow7 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[46:47], 0, v[0:1] +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[48:49], 0, v2 +; GLOBALNESS0-NEXT: s_branch .LBB1_16 +; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow7 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS0-NEXT: .LBB1_14: ; %bb63.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[86:87] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 -; GLOBALNESS0-NEXT: .LBB1_15: ; %bb44.i +; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[88:89], exec +; 
GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc0 .LBB1_26 +; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[94:95] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 -; GLOBALNESS0-NEXT: ; %bb.16: ; %bb46.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[88:89] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 -; GLOBALNESS0-NEXT: ; %bb.17: ; %bb50.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 -; GLOBALNESS0-NEXT: ; %bb.18: ; %bb3.i.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[40:41] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 -; GLOBALNESS0-NEXT: ; %bb.19: ; %bb6.i.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[56:57] -; GLOBALNESS0-NEXT: .LBB1_20: ; %spam.exit.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[90:91] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 -; GLOBALNESS0-NEXT: ; %bb.21: ; %bb55.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s64, s60, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s65, s61, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[84:85], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_15 +; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[86:87], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 
0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_15 +; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[42:43], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_21 +; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[100:101], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_21 +; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[46:47], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[92:93], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_15 +; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS0-NEXT: s_add_u32 s54, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s55, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] +; GLOBALNESS0-NEXT: s_mov_b32 s12, s52 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s51 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s50 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[98:99] ; GLOBALNESS0-NEXT: 
v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s52 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s51 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s50 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], a[32:33], off -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 -; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[98:99] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[48:49] +; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 +; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS0-NEXT: s_branch .LBB1_13 -; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock +; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], v[42:43], off +; GLOBALNESS0-NEXT: s_branch .LBB1_14 +; GLOBALNESS0-NEXT: .LBB1_24: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s39, 0 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s79, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 
-; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_10 +; GLOBALNESS0-NEXT: .LBB1_25: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GLOBALNESS0-NEXT: s_branch .LBB1_3 -; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow14 +; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow14 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b32 s36, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s37, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s38, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s39, s93 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[40:41] -; GLOBALNESS0-NEXT: s_mov_b32 s40, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s41, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s42, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s43, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s44, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s45, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s46, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s47, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s48, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s49, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s50, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s51, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s52, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s53, s93 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[40:41], s[40:41] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[42:43], s[42:43] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[8:9], s[44:45], s[44:45] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[10:11], s[46:47], s[46:47] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[12:13], s[48:49], s[48:49] op_sel:[0,1] -; GLOBALNESS0-NEXT: 
v_pk_mov_b32 v[14:15], s[50:51], s[50:51] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[16:17], s[52:53], s[52:53] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[18:19], s[54:55], s[54:55] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[20:21], s[56:57], s[56:57] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[22:23], s[58:59], s[58:59] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[24:25], s[60:61], s[60:61] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[6:7] -; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[4:5] -; GLOBALNESS0-NEXT: s_mov_b32 s39, s75 -; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[72:73] -; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow15 +; GLOBALNESS0-NEXT: s_mov_b32 s60, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s61, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s62, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s63, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s64, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s65, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s66, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s67, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s68, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s69, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s70, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s71, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s72, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s73, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s74, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s75, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s76, s81 +; GLOBALNESS0-NEXT: s_mov_b32 s77, s81 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[62:63], s[62:63] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[64:65], s[64:65] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[66:67], s[66:67] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[8:9], s[68:69], s[68:69] op_sel:[0,1] +; 
GLOBALNESS0-NEXT: v_pk_mov_b32 v[10:11], s[70:71], s[70:71] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[12:13], s[72:73], s[72:73] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[14:15], s[74:75], s[74:75] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[16:17], s[76:77], s[76:77] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[18:19], s[78:79], s[78:79] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[20:21], s[80:81], s[80:81] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[22:23], s[82:83], s[82:83] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[24:25], s[84:85], s[84:85] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[86:87], s[86:87] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[88:89], s[88:89] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[90:91], s[90:91] op_sel:[0,1] +; GLOBALNESS0-NEXT: .LBB1_27: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[70:71] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[58:59] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[44:45] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 -; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i +; GLOBALNESS0-NEXT: ; %bb.28: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 2 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 3 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 -; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i +; GLOBALNESS0-NEXT: s_and_b64 s[6:7], s[94:95], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s6, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s6, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_1 +; GLOBALNESS0-NEXT: ; %bb.29: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: 
global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 -; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i +; GLOBALNESS0-NEXT: .LBB1_30: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 -; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: .LBB1_31: ; %loop.exit.guard +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_32 -; GLOBALNESS0-NEXT: ; %bb.31: ; %bb7.i.i -; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS0-NEXT: s_cbranch_scc0 .LBB1_33 +; GLOBALNESS0-NEXT: ; %bb.32: ; %bb7.i.i +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s52 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s51 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s50 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; 
GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS0-NEXT: .LBB1_32: ; %Flow -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_34 -; GLOBALNESS0-NEXT: ; %bb.33: ; %bb11.i.i -; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS0-NEXT: .LBB1_33: ; %Flow +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GLOBALNESS0-NEXT: s_cselect_b32 s4, 0, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s4, 1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_35 +; GLOBALNESS0-NEXT: ; %bb.34: ; %bb11.i.i +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s52 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s51 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s50 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock +; GLOBALNESS0-NEXT: .LBB1_35: ; %UnifiedUnreachableBlock bb: store i32 0, i32 addrspace(1)* null, align 4 %tmp4 = load i32, i32 addrspace(1)* %arg1.global, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -146,18 +146,22 @@ ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 ; GCN-IR-NEXT: s_or_b64 
s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, 1 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GCN-IR-NEXT: s_cmp_lg_u32 s11, 1 ; GCN-IR-NEXT: s_mov_b32 s11, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 +; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_cselect_b32 s13, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s12 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_cmp_lg_u32 s13, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16 ; GCN-IR-NEXT: s_add_u32 s15, s4, -1 @@ -784,7 +788,6 @@ ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_and_b32 s3, s5, 0xffff ; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000 @@ -814,17 +817,22 @@ ; GCN-IR-NEXT: s_cselect_b32 s7, 0, s9 ; GCN-IR-NEXT: s_cselect_b32 s6, 0, s8 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, 1 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 +; GCN-IR-NEXT: s_cmp_lg_u32 s11, 1 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[16:17], 0 -; 
GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s12 -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s13, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s12 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s6 +; GCN-IR-NEXT: s_cmp_lg_u32 s13, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s16 ; GCN-IR-NEXT: s_add_u32 s15, s2, -1 @@ -999,17 +1007,21 @@ ; GCN-IR-NEXT: s_and_b64 s[6:7], s[12:13], exec ; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s7, 0, 1 +; GCN-IR-NEXT: s_cmp_lg_u32 s7, 1 ; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s9, 63, s10 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s9 -; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s10 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s6 +; GCN-IR-NEXT: s_cmp_lg_u32 s9, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s12 ; GCN-IR-NEXT: s_add_u32 s14, s2, -1 @@ -1446,17 +1458,21 @@ ; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, 0, 1 +; GCN-IR-NEXT: s_cmp_lg_u32 s4, 1 ; 
GCN-IR-NEXT: s_mov_b64 s[4:5], 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s8 -; GCN-IR-NEXT: s_cbranch_vccz .LBB11_4 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s8 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GCN-IR-NEXT: s_cmp_lg_u32 s9, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s12 ; GCN-IR-NEXT: s_add_u32 s2, s10, 0xffffffc4 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -52,14 +52,15 @@ ; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: s_waitcnt expcnt(0) -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: .LBB1_1: ; %bb9 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] -; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 +; CHECK-NEXT: s_and_b64 s[2:3], s[0:1], exec +; CHECK-NEXT: s_cselect_b32 s2, 0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s2, 1 +; CHECK-NEXT: s_cbranch_scc1 .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb11 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 ; CHECK-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll 
b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -145,18 +145,22 @@ ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, 1 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GCN-IR-NEXT: s_cmp_lg_u32 s11, 1 ; GCN-IR-NEXT: s_mov_b32 s11, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 -; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 +; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_cselect_b32 s13, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s12 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_cmp_lg_u32 s13, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16 ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 @@ -827,17 +831,21 @@ ; GCN-IR-NEXT: s_and_b64 s[6:7], s[12:13], exec ; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s7, 0, 1 +; GCN-IR-NEXT: s_cmp_lg_u32 s7, 1 ; GCN-IR-NEXT: s_mov_b32 s7, 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB6_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s9, 63, s10 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; 
GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s9 -; GCN-IR-NEXT: s_cbranch_vccz .LBB6_4 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s10 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s6 +; GCN-IR-NEXT: s_cmp_lg_u32 s9, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB6_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s12 ; GCN-IR-NEXT: s_add_u32 s14, s2, -1 @@ -1012,17 +1020,21 @@ ; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, 0, 1 +; GCN-IR-NEXT: s_cmp_lg_u32 s4, 1 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s9, 63, s10 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s9 -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, 1 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s10 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GCN-IR-NEXT: s_cmp_lg_u32 s9, 1 +; GCN-IR-NEXT: s_cbranch_scc0 .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[10:11], s[2:3], s12 ; GCN-IR-NEXT: s_add_u32 s8, s8, 0xffffffc4 diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll --- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll @@ -5,12 +5,12 @@ ; CHECK-LABEL: icmp_test: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: v_cmp_eq_u16_e64 s[0:1], 0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: ds_store_b32 v1, v0 +; CHECK-NEXT: s_cselect_b32 s0, 0, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: ds_store_b32 v0, v1 ; CHECK-NEXT: s_endpgm entry: %icmp.intr = tail call i64 @llvm.amdgcn.icmp.i64.i16(i16 0, i16 0, i32 32) @@ -24,16 +24,16 @@ ; CHECK-LABEL: fcmp_test: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_b32 s0, s[0:1], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; CHECK-NEXT: v_cmp_le_f16_e64 s[0:1], s0, s1 ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_cselect_b32 s0, 0, 1 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: ds_store_b32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: ds_store_b32 v0, v1 ; CHECK-NEXT: s_endpgm entry: %fcmp.intr = tail call i64 @llvm.amdgcn.fcmp.i64.f16(half %x, half %y, i32 5) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1844,13 +1844,19 @@ ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000 -; GFX9-W64-NEXT: s_branch .LBB31_2 +; GFX9-W64-NEXT: s_branch .LBB31_3 ; GFX9-W64-NEXT: .LBB31_1: ; %body -; GFX9-W64-NEXT: ; in Loop: Header=BB31_2 Depth=1 +; GFX9-W64-NEXT: ; in Loop: Header=BB31_3 Depth=1 ; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8 -; GFX9-W64-NEXT: s_cbranch_execz .LBB31_4 -; GFX9-W64-NEXT: .LBB31_2: ; %loop +; GFX9-W64-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-W64-NEXT: .LBB31_2: 
; %Flow +; GFX9-W64-NEXT: ; in Loop: Header=BB31_3 Depth=1 +; GFX9-W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-W64-NEXT: s_cselect_b32 s2, 0, 1 +; GFX9-W64-NEXT: s_cmp_lg_u32 s2, 1 +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB31_5 +; GFX9-W64-NEXT: .LBB31_3: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 @@ -1859,10 +1865,12 @@ ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 ; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1 -; GFX9-W64-NEXT: ; %bb.3: +; GFX9-W64-NEXT: ; %bb.4: ; in Loop: Header=BB31_3 Depth=1 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], -1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-W64-NEXT: ; implicit-def: $vgpr8 -; GFX9-W64-NEXT: .LBB31_4: ; %break +; GFX9-W64-NEXT: s_branch .LBB31_2 +; GFX9-W64-NEXT: .LBB31_5: ; %break ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: ; return to shader part epilog @@ -1875,14 +1883,20 @@ ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: s_branch .LBB31_2 +; GFX10-W32-NEXT: s_branch .LBB31_3 ; GFX10-W32-NEXT: .p2align 6 ; GFX10-W32-NEXT: .LBB31_1: ; %body -; GFX10-W32-NEXT: ; in Loop: Header=BB31_2 Depth=1 +; GFX10-W32-NEXT: ; in Loop: Header=BB31_3 Depth=1 ; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 -; GFX10-W32-NEXT: s_cbranch_execz .LBB31_4 -; GFX10-W32-NEXT: .LBB31_2: ; %loop +; GFX10-W32-NEXT: s_mov_b32 s1, 0 +; GFX10-W32-NEXT: .LBB31_2: ; %Flow +; GFX10-W32-NEXT: ; in Loop: Header=BB31_3 Depth=1 +; GFX10-W32-NEXT: s_and_b32 s1, s1, exec_lo +; GFX10-W32-NEXT: s_cselect_b32 s1, 0, 1 +; GFX10-W32-NEXT: s_cmp_lg_u32 s1, 1 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB31_5 +; GFX10-W32-NEXT: .LBB31_3: ; %loop ; GFX10-W32-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -1891,10 +1905,12 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1 -; GFX10-W32-NEXT: ; %bb.3: +; GFX10-W32-NEXT: ; %bb.4: ; in Loop: Header=BB31_3 Depth=1 +; GFX10-W32-NEXT: s_mov_b32 s1, -1 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8 -; GFX10-W32-NEXT: .LBB31_4: ; %break +; GFX10-W32-NEXT: s_branch .LBB31_2 +; GFX10-W32-NEXT: .LBB31_5: ; %break ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 @@ -2134,15 +2150,21 @@ ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf -; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3 -; GFX9-W64-NEXT: s_branch .LBB35_4 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: s_branch .LBB35_3 ; GFX9-W64-NEXT: .LBB35_2: +; GFX9-W64-NEXT: s_mov_b64 s[0:1], -1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9-W64-NEXT: .LBB35_3: ; %if +; GFX9-W64-NEXT: .LBB35_3: ; %Flow +; GFX9-W64-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-W64-NEXT: s_cselect_b32 s0, 0, 1 +; GFX9-W64-NEXT: s_cmp_lg_u32 s0, 1 +; GFX9-W64-NEXT: s_cbranch_scc1 .LBB35_5 +; GFX9-W64-NEXT: ; %bb.4: ; %if ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf -; GFX9-W64-NEXT: .LBB35_4: ; %end +; GFX9-W64-NEXT: .LBB35_5: ; %end ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX9-W64-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen @@ -2160,15 +2182,21 @@ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1 ; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf 
dim:SQ_RSRC_IMG_2D -; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3 -; GFX10-W32-NEXT: s_branch .LBB35_4 +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: s_branch .LBB35_3 ; GFX10-W32-NEXT: .LBB35_2: +; GFX10-W32-NEXT: s_mov_b32 s0, -1 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX10-W32-NEXT: .LBB35_3: ; %if +; GFX10-W32-NEXT: .LBB35_3: ; %Flow +; GFX10-W32-NEXT: s_and_b32 s0, s0, exec_lo +; GFX10-W32-NEXT: s_cselect_b32 s0, 0, 1 +; GFX10-W32-NEXT: s_cmp_lg_u32 s0, 1 +; GFX10-W32-NEXT: s_cbranch_scc1 .LBB35_5 +; GFX10-W32-NEXT: ; %bb.4: ; %if ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: .LBB35_4: ; %end +; GFX10-W32-NEXT: .LBB35_5: ; %end ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen