Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -958,7 +958,6 @@ } void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { - addPass(createLICMPass()); addPass(createSeparateConstOffsetFromGEPPass()); // ReassociateGEPs exposes more opportunities for SLSR. See // the example in reassociate-geps-and-slsr.ll. @@ -1033,6 +1032,11 @@ // TODO: May want to move later or split into an early and late one. addPass(createAMDGPUCodeGenPreparePass()); } + + // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may + // have expanded. + if (TM.getOptLevel() > CodeGenOpt::Less) + addPass(createLICMPass()); } TargetPassConfig::addIRPasses(); Index: llvm/test/CodeGen/AMDGPU/idiv-licm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -6,89 +6,93 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s8, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s4, 0, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4 +; GFX9-NEXT: s_add_i32 s4, s6, s4 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-NEXT: .LBB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_not_b32 s9, s6 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 -; GFX9-NEXT: s_mul_i32 s9, s4, s9 -; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_add_i32 s7, s2, s7 -; GFX9-NEXT: s_add_i32 s9, s2, s9 -; GFX9-NEXT: s_cmp_ge_u32 s7, s4 -; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_cselect_b32 s7, s9, s7 -; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s4 -; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_add_u32 s2, s2, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_not_b32 s10, s7 +; GFX9-NEXT: s_mul_i32 s9, s5, s7 +; GFX9-NEXT: s_mul_i32 s10, s5, s10 +; GFX9-NEXT: s_add_i32 s11, s7, 1 +; GFX9-NEXT: s_sub_i32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s10, s8, s10 +; GFX9-NEXT: s_cmp_ge_u32 s9, s5 +; GFX9-NEXT: s_cselect_b32 s11, s11, s7 +; GFX9-NEXT: s_cselect_b32 s9, s10, s9 +; GFX9-NEXT: s_add_i32 s10, s11, 1 +; GFX9-NEXT: s_cmp_ge_u32 s9, s5 +; GFX9-NEXT: s_cselect_b32 s9, s10, s11 +; GFX9-NEXT: s_add_u32 s10, s0, s2 +; GFX9-NEXT: s_addc_u32 s11, s1, s3 +; GFX9-NEXT: s_add_i32 s8, s8, 1 +; GFX9-NEXT: s_add_u32 s6, s6, s4 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s2, s2, 4 ; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: global_store_dword v1, v2, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, 4 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX9-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x2c +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s5, 0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s2, 0, s5 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mul_i32 s2, s2, s4 +; GFX10-NEXT: s_mul_hi_u32 s6, s4, s2 +; GFX10-NEXT: s_mov_b64 s[2:3], 0 +; GFX10-NEXT: s_add_i32 s4, s4, s6 +; GFX10-NEXT: s_mov_b64 s[6:7], 0 ; GFX10-NEXT: .LBB0_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s6, v0 -; GFX10-NEXT: s_mul_i32 s7, s5, s6 -; GFX10-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX10-NEXT: s_add_i32 s6, s6, s7 -; GFX10-NEXT: s_mul_i32 s7, s3, s6 -; GFX10-NEXT: s_mul_hi_u32 s6, s2, s6 -; GFX10-NEXT: s_add_i32 s6, s6, s7 -; GFX10-NEXT: s_not_b32 s8, s6 -; GFX10-NEXT: s_mul_i32 s7, s5, s6 -; GFX10-NEXT: s_mul_i32 s8, s4, s8 -; GFX10-NEXT: s_add_i32 s7, s2, s7 -; GFX10-NEXT: s_add_i32 s9, s6, 1 -; GFX10-NEXT: s_add_i32 s8, s2, s8 -; GFX10-NEXT: s_cmp_ge_u32 s7, s4 -; GFX10-NEXT: s_cselect_b32 s6, s9, s6 -; GFX10-NEXT: s_cselect_b32 s7, s8, s7 -; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_cmp_ge_u32 s7, s4 -; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_add_u32 s2, s2, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_add_u32 s0, s0, 4 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX10-NEXT: s_not_b32 s10, s7 +; GFX10-NEXT: s_mul_i32 s9, s5, s7 +; GFX10-NEXT: s_mul_i32 s10, s5, s10 +; GFX10-NEXT: s_sub_i32 s9, s8, s9 +; GFX10-NEXT: s_add_i32 s11, s7, 1 +; GFX10-NEXT: s_add_i32 s10, s8, s10 +; GFX10-NEXT: s_cmp_ge_u32 s9, s5 +; GFX10-NEXT: s_cselect_b32 s11, s11, s7 +; GFX10-NEXT: s_cselect_b32 s9, s10, s9 +; GFX10-NEXT: s_add_i32 s10, s11, 1 +; GFX10-NEXT: s_cmp_ge_u32 s9, s5 +; GFX10-NEXT: s_cselect_b32 s9, s10, s11 +; GFX10-NEXT: s_add_u32 s10, s0, s2 +; GFX10-NEXT: s_addc_u32 s11, s1, s3 +; GFX10-NEXT: s_add_i32 s8, s8, 1 +; GFX10-NEXT: s_add_u32 s6, s6, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_add_u32 s2, s2, 4 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX10-NEXT: global_store_dword v0, v1, s[10:11] ; GFX10-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -96,51 +100,52 @@ ; GFX11-LABEL: udiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[2:3], 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX11-NEXT: s_sub_i32 s5, 0, s4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX11-NEXT: s_sub_i32 s2, 0, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mul_i32 s2, s2, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_hi_u32 s6, s4, s2 +; GFX11-NEXT: s_mov_b64 s[2:3], 0 +; GFX11-NEXT: s_add_i32 s4, s4, s6 +; GFX11-NEXT: s_mov_b64 s[6:7], 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB0_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s6, v0 -; GFX11-NEXT: s_mul_i32 s7, s5, s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX11-NEXT: s_add_i32 s6, s6, s7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s7, s3, s6 -; GFX11-NEXT: s_mul_hi_u32 s6, s2, s6 -; GFX11-NEXT: s_add_i32 s6, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s8, s6 -; GFX11-NEXT: s_mul_i32 s7, s5, s6 -; GFX11-NEXT: s_mul_i32 s8, s4, s8 -; GFX11-NEXT: s_add_i32 s7, s2, s7 -; GFX11-NEXT: s_add_i32 s9, s6, 1 -; GFX11-NEXT: s_add_i32 s8, s2, s8 -; GFX11-NEXT: s_cmp_ge_u32 s7, s4 -; GFX11-NEXT: s_cselect_b32 s6, s9, s6 -; GFX11-NEXT: s_cselect_b32 s7, s8, s7 -; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_cmp_ge_u32 s7, s4 -; GFX11-NEXT: s_cselect_b32 s6, s8, s6 -; GFX11-NEXT: s_add_u32 s2, s2, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_not_b32 s10, s7 +; GFX11-NEXT: s_mul_i32 s9, s5, s7 +; GFX11-NEXT: s_mul_i32 s10, s5, s10 +; GFX11-NEXT: s_sub_i32 s9, s8, s9 +; GFX11-NEXT: s_add_i32 s11, s7, 1 +; GFX11-NEXT: s_add_i32 s10, s8, s10 +; GFX11-NEXT: s_cmp_ge_u32 s9, s5 +; GFX11-NEXT: s_cselect_b32 s11, s11, s7 +; GFX11-NEXT: s_cselect_b32 s9, s10, s9 +; GFX11-NEXT: s_add_i32 s10, s11, 1 +; GFX11-NEXT: s_cmp_ge_u32 s9, s5 +; GFX11-NEXT: s_cselect_b32 s9, s10, s11 +; GFX11-NEXT: s_add_u32 s10, s0, s2 +; GFX11-NEXT: s_addc_u32 s11, s1, s3 +; GFX11-NEXT: s_add_i32 s8, s8, 1 +; GFX11-NEXT: s_add_u32 s6, s6, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 4 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, 4 -; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -165,85 +170,89 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s8, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s4, 0, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4 +; GFX9-NEXT: s_add_i32 s4, s6, s4 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-NEXT: .LBB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 -; GFX9-NEXT: s_not_b32 s6, s6 -; GFX9-NEXT: s_mul_i32 s6, s4, s6 -; GFX9-NEXT: s_add_i32 s7, s2, s7 -; GFX9-NEXT: s_add_i32 s6, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s7, s4 -; GFX9-NEXT: s_cselect_b32 s6, s6, s7 -; GFX9-NEXT: s_sub_i32 s7, s6, s4 -; GFX9-NEXT: s_cmp_ge_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_add_u32 s2, s2, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_not_b32 s10, s7 +; GFX9-NEXT: s_mul_i32 s9, s5, s7 +; GFX9-NEXT: s_mul_i32 s10, s5, s10 +; GFX9-NEXT: s_sub_i32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s10, s8, s10 +; GFX9-NEXT: s_cmp_ge_u32 s9, s5 +; GFX9-NEXT: s_cselect_b32 s9, s10, s9 +; GFX9-NEXT: s_sub_i32 s10, s9, s5 +; GFX9-NEXT: s_cmp_ge_u32 s9, s5 +; GFX9-NEXT: s_cselect_b32 s9, s10, s9 +; GFX9-NEXT: s_add_u32 s10, s0, s2 +; GFX9-NEXT: s_addc_u32 s11, s1, s3 +; GFX9-NEXT: s_add_i32 s8, s8, 1 +; GFX9-NEXT: s_add_u32 s6, s6, s4 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s2, s2, 4 ; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: global_store_dword v1, v2, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, 4 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX9-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: urem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x2c +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s5, 0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: s_sub_i32 s2, 0, s5 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mul_i32 s2, s2, s4 +; GFX10-NEXT: s_mul_hi_u32 s6, s4, s2 +; GFX10-NEXT: s_mov_b64 s[2:3], 0 +; GFX10-NEXT: s_add_i32 s4, s4, s6 +; GFX10-NEXT: s_mov_b64 s[6:7], 0 ; GFX10-NEXT: .LBB1_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s6, v0 -; GFX10-NEXT: s_mul_i32 s7, s5, s6 -; GFX10-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX10-NEXT: s_add_i32 s6, s6, s7 -; GFX10-NEXT: s_mul_i32 s7, s3, s6 -; GFX10-NEXT: s_mul_hi_u32 s6, s2, s6 -; GFX10-NEXT: s_add_i32 s6, s6, s7 -; GFX10-NEXT: s_not_b32 s7, s6 -; GFX10-NEXT: s_mul_i32 s6, s5, s6 -; GFX10-NEXT: s_mul_i32 s7, s4, s7 -; GFX10-NEXT: s_add_i32 s6, s2, s6 -; GFX10-NEXT: s_add_i32 s7, s2, s7 -; GFX10-NEXT: s_cmp_ge_u32 s6, s4 -; GFX10-NEXT: s_cselect_b32 s6, s7, s6 -; GFX10-NEXT: s_sub_i32 s7, s6, s4 -; GFX10-NEXT: s_cmp_ge_u32 s6, s4 -; GFX10-NEXT: s_cselect_b32 s6, s7, s6 -; GFX10-NEXT: s_add_u32 s2, s2, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: s_not_b32 s9, s7 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_add_u32 s0, s0, 4 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX10-NEXT: s_mul_i32 s10, s5, s7 +; GFX10-NEXT: s_mul_i32 s9, s5, s9 +; GFX10-NEXT: s_sub_i32 s10, s8, s10 +; GFX10-NEXT: s_add_i32 s9, s8, s9 +; GFX10-NEXT: s_cmp_ge_u32 s10, s5 +; GFX10-NEXT: s_cselect_b32 s9, s9, s10 +; GFX10-NEXT: s_sub_i32 s10, s9, s5 +; GFX10-NEXT: s_cmp_ge_u32 s9, s5 +; GFX10-NEXT: s_cselect_b32 s9, s10, s9 +; GFX10-NEXT: s_add_u32 s10, s0, s2 +; GFX10-NEXT: s_addc_u32 s11, s1, s3 +; GFX10-NEXT: s_add_i32 s8, s8, 1 +; GFX10-NEXT: s_add_u32 s6, s6, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_add_u32 s2, s2, 4 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX10-NEXT: global_store_dword v0, v1, s[10:11] ; GFX10-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -251,50 +260,51 @@ ; GFX11-LABEL: urem32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[2:3], 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX11-NEXT: s_sub_i32 s5, 0, s4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX11-NEXT: s_sub_i32 s2, 0, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mul_i32 s2, s2, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_hi_u32 s6, s4, s2 +; GFX11-NEXT: s_mov_b64 s[2:3], 0 +; GFX11-NEXT: s_add_i32 s4, s4, s6 +; GFX11-NEXT: s_mov_b64 s[6:7], 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB1_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s6, v0 -; GFX11-NEXT: s_mul_i32 s7, s5, s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX11-NEXT: s_add_i32 s6, s6, s7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s7, s3, s6 -; GFX11-NEXT: s_mul_hi_u32 s6, s2, s6 -; GFX11-NEXT: s_add_i32 s6, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s7, s6 -; GFX11-NEXT: s_mul_i32 s6, s5, s6 -; GFX11-NEXT: s_mul_i32 s7, s4, s7 -; GFX11-NEXT: s_add_i32 s6, s2, s6 -; GFX11-NEXT: s_add_i32 s7, s2, s7 -; GFX11-NEXT: s_cmp_ge_u32 s6, s4 -; GFX11-NEXT: s_cselect_b32 s6, s7, s6 +; GFX11-NEXT: s_not_b32 s9, s7 +; GFX11-NEXT: s_mul_i32 s10, s5, s7 +; GFX11-NEXT: s_mul_i32 s9, s5, s9 +; GFX11-NEXT: s_sub_i32 s10, s8, s10 +; GFX11-NEXT: s_add_i32 s9, s8, s9 +; GFX11-NEXT: s_cmp_ge_u32 s10, s5 +; GFX11-NEXT: s_cselect_b32 s9, s9, s10 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s7, s6, s4 -; GFX11-NEXT: s_cmp_ge_u32 s6, s4 -; GFX11-NEXT: s_cselect_b32 s6, s7, s6 -; GFX11-NEXT: s_add_u32 s2, s2, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_sub_i32 s10, s9, s5 +; GFX11-NEXT: s_cmp_ge_u32 s9, s5 +; GFX11-NEXT: s_cselect_b32 s9, s10, s9 +; GFX11-NEXT: s_add_u32 s10, s0, s2 +; GFX11-NEXT: s_addc_u32 s11, s1, s3 +; GFX11-NEXT: s_add_i32 s8, s8, 1 +; GFX11-NEXT: s_add_u32 s6, s6, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 4 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, 4 -; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -319,89 +329,90 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: s_ashr_i32 s3, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s4, s2, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s2, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s2, s2, s6 +; GFX9-NEXT: s_mul_hi_u32 s2, s6, s2 +; GFX9-NEXT: s_add_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s6 -; GFX9-NEXT: s_mul_i32 s7, s6, s3 -; GFX9-NEXT: s_sub_i32 s7, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s2 +; GFX9-NEXT: s_mul_i32 s7, s6, s4 +; GFX9-NEXT: s_sub_i32 s7, s5, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s3 -; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_sub_i32 s9, s7, s4 +; GFX9-NEXT: s_cmp_ge_u32 s7, s4 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 ; GFX9-NEXT: s_cselect_b32 s7, s9, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cmp_ge_u32 s7, s4 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_xor_b32 s6, s6, s2 -; GFX9-NEXT: s_sub_i32 s6, s6, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: global_store_dword v1, v2, s[0:1] +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s6, s6, s3 +; GFX9-NEXT: s_add_i32 s5, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s2, s3, 31 -; GFX10-NEXT: s_add_i32 s3, s3, s2 -; GFX10-NEXT: s_xor_b32 s3, s3, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX10-NEXT: s_sub_i32 s5, 0, s3 +; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: s_xor_b32 s4, s2, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s5, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mul_i32 s5, s5, s2 +; GFX10-NEXT: s_mul_hi_u32 s6, s2, s5 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_add_i32 s2, s2, s6 ; GFX10-NEXT: .LBB2_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s6, v0 -; GFX10-NEXT: s_mul_i32 s7, s5, s6 -; GFX10-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX10-NEXT: s_add_i32 s6, s6, s7 -; GFX10-NEXT: s_mul_hi_u32 s6, s4, s6 -; GFX10-NEXT: s_mul_i32 s7, s6, s3 +; GFX10-NEXT: s_mul_hi_u32 s6, s5, s2 +; GFX10-NEXT: s_mul_i32 s7, s6, s4 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_sub_i32 s7, s4, s7 -; GFX10-NEXT: s_sub_i32 s9, s7, s3 -; GFX10-NEXT: s_cmp_ge_u32 s7, s3 +; GFX10-NEXT: s_sub_i32 s7, s5, s7 +; GFX10-NEXT: s_sub_i32 s9, s7, s4 +; GFX10-NEXT: s_cmp_ge_u32 s7, s4 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_cselect_b32 s7, s9, s7 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_cmp_ge_u32 s7, s3 +; GFX10-NEXT: s_cmp_ge_u32 s7, s4 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_add_i32 s4, s4, 1 -; GFX10-NEXT: s_xor_b32 s6, s6, s2 -; GFX10-NEXT: s_sub_i32 s6, s6, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: s_add_i32 s5, s5, 1 +; GFX10-NEXT: s_xor_b32 s6, s6, s3 +; GFX10-NEXT: s_sub_i32 s6, s6, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s5, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -409,54 +420,53 @@ ; GFX11-LABEL: sdiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_ashr_i32 s2, s3, 31 +; GFX11-NEXT: s_ashr_i32 s3, s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s3, s3, s2 -; GFX11-NEXT: s_xor_b32 s3, s3, s2 +; GFX11-NEXT: s_add_i32 s2, s2, s3 +; GFX11-NEXT: s_xor_b32 s4, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX11-NEXT: s_sub_i32 s5, 0, s3 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX11-NEXT: s_sub_i32 s5, 0, s4 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s5, s5, s2 +; GFX11-NEXT: s_mul_hi_u32 s6, s2, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_add_i32 s2, s2, s6 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s6, v0 -; GFX11-NEXT: s_mul_i32 s7, s5, s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX11-NEXT: s_add_i32 s6, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s6, s4, s6 -; GFX11-NEXT: s_mul_i32 s7, s6, s3 +; GFX11-NEXT: s_mul_hi_u32 s6, s5, s2 +; GFX11-NEXT: s_mul_i32 s7, s6, s4 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_sub_i32 s7, s4, s7 +; GFX11-NEXT: s_sub_i32 s7, s5, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s9, s7, s3 -; GFX11-NEXT: s_cmp_ge_u32 s7, s3 +; GFX11-NEXT: s_sub_i32 s9, s7, s4 +; GFX11-NEXT: s_cmp_ge_u32 s7, s4 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 ; GFX11-NEXT: s_cselect_b32 s7, s9, s7 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_cmp_ge_u32 s7, s3 +; GFX11-NEXT: s_cmp_ge_u32 s7, s4 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 -; GFX11-NEXT: s_add_i32 s4, s4, 1 -; GFX11-NEXT: s_xor_b32 s6, s6, s2 +; GFX11-NEXT: s_add_i32 s5, s5, 1 +; GFX11-NEXT: s_xor_b32 s6, s6, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s6, s2 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-NEXT: s_sub_i32 s6, s6, s3 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s5, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -482,39 +492,39 @@ ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s3, s2, 31 ; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_xor_b32 s3, s2, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s2, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s2, s2, s5 +; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2 +; GFX9-NEXT: s_add_i32 s2, s5, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s6, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX9-NEXT: s_mul_i32 s5, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s3, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s2 -; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s2 +; GFX9-NEXT: s_mul_i32 s5, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s4, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s2 -; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s3, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: global_store_dword v1, v2, s[0:1] +; GFX9-NEXT: s_add_i32 s4, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -522,40 +532,41 @@ ; GFX10-LABEL: srem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_xor_b32 s2, s2, s3 -; GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: s_sub_i32 s4, 0, s2 +; GFX10-NEXT: s_xor_b32 s3, s2, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX10-NEXT: s_sub_i32 s4, 0, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mul_i32 s4, s4, s2 +; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s2, s2, s5 ; GFX10-NEXT: .LBB3_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s5, v0 -; GFX10-NEXT: s_mul_i32 s6, s4, s5 -; GFX10-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX10-NEXT: s_add_i32 s5, s5, s6 -; GFX10-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX10-NEXT: s_mul_i32 s5, s5, s2 -; GFX10-NEXT: s_sub_i32 s5, s3, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s2 -; GFX10-NEXT: s_cmp_ge_u32 s5, s2 +; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2 +; GFX10-NEXT: s_mul_i32 s5, s5, s3 +; GFX10-NEXT: s_sub_i32 s5, s4, s5 +; GFX10-NEXT: s_sub_i32 s6, s5, s3 +; GFX10-NEXT: s_cmp_ge_u32 s5, s3 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s2 -; GFX10-NEXT: s_cmp_ge_u32 s5, s2 +; GFX10-NEXT: s_sub_i32 s6, s5, s3 +; GFX10-NEXT: s_cmp_ge_u32 s5, s3 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_add_i32 s3, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -565,47 +576,47 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_ashr_i32 s3, s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, s3 -; GFX11-NEXT: s_xor_b32 s2, s2, s3 -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX11-NEXT: s_sub_i32 s4, 0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_xor_b32 s3, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX11-NEXT: s_sub_i32 s4, 0, s3 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s4, s4, s2 +; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s2, s2, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB3_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_mul_i32 s6, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX11-NEXT: s_add_i32 s5, s5, s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX11-NEXT: s_mul_i32 s5, s5, s2 +; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2 +; GFX11-NEXT: s_mul_i32 s5, s5, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s5, s3, s5 -; GFX11-NEXT: s_sub_i32 s6, s5, s2 -; GFX11-NEXT: s_cmp_ge_u32 s5, s2 +; GFX11-NEXT: s_sub_i32 s5, s4, s5 +; GFX11-NEXT: s_sub_i32 s6, s5, s3 +; GFX11-NEXT: s_cmp_ge_u32 s5, s3 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s5, s2 -; GFX11-NEXT: s_cmp_ge_u32 s5, s2 +; GFX11-NEXT: s_sub_i32 s6, s5, s3 +; GFX11-NEXT: s_cmp_ge_u32 s5, s3 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 -; GFX11-NEXT: s_add_i32 s3, s3, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, s5 -; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-NEXT: s_add_i32 s4, s4, 1 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -636,7 +647,7 @@ ; GFX9-NEXT: s_movk_i32 s6, 0x400 ; GFX9-NEXT: s_mov_b32 s7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -670,7 +681,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s0, s4, 0xffff ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -704,7 +715,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s0, s4, 0xffff ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -762,7 +773,7 @@ ; GFX9-NEXT: s_movk_i32 s7, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX9-NEXT: s_and_b32 s6, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: .LBB5_1: ; %bb3 @@ -796,7 +807,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s1, s4, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: .LBB5_1: ; %bb3 @@ -828,7 +839,7 @@ ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s1, s4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v2 Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -452,18 +452,8 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction ; GCN-O1-OPTS-NEXT: SROA -; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-OPTS-NEXT: Function Alias Analysis Results -; GCN-O1-OPTS-NEXT: Memory SSA ; GCN-O1-OPTS-NEXT: Natural Loop Information -; GCN-O1-OPTS-NEXT: Canonicalize natural loops -; GCN-O1-OPTS-NEXT: LCSSA Verifier -; GCN-O1-OPTS-NEXT: Loop-Closed SSA Form Pass ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis -; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis -; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis -; GCN-O1-OPTS-NEXT: Loop Pass Manager -; GCN-O1-OPTS-NEXT: Loop Invariant Code Motion ; GCN-O1-OPTS-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis ; GCN-O1-OPTS-NEXT: Straight line strength reduction @@ -474,6 +464,7 @@ ; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: AMDGPU IR optimizations +; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Canonicalize natural loops ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis ; GCN-O1-OPTS-NEXT: Loop Pass Manager @@ -753,18 +744,8 @@ ; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction ; GCN-O2-NEXT: SROA -; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: Natural Loop Information -; GCN-O2-NEXT: Canonicalize natural loops -; GCN-O2-NEXT: LCSSA Verifier -; GCN-O2-NEXT: Loop-Closed SSA Form Pass ; GCN-O2-NEXT: Scalar Evolution Analysis -; GCN-O2-NEXT: Lazy Branch Probability Analysis -; GCN-O2-NEXT: Lazy Block Frequency Analysis -; GCN-O2-NEXT: Loop Pass Manager -; GCN-O2-NEXT: Loop Invariant Code Motion ; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Straight line strength reduction @@ -775,8 +756,17 @@ ; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: AMDGPU IR optimizations +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: Canonicalize natural loops +; GCN-O2-NEXT: LCSSA Verifier +; GCN-O2-NEXT: Loop-Closed SSA Form Pass ; GCN-O2-NEXT: Scalar Evolution Analysis +; GCN-O2-NEXT: Lazy Branch Probability Analysis +; GCN-O2-NEXT: Lazy Block Frequency Analysis +; GCN-O2-NEXT: Loop Pass Manager +; GCN-O2-NEXT: Loop Invariant Code Motion ; GCN-O2-NEXT: Loop Pass Manager ; GCN-O2-NEXT: Canonicalize Freeze Instructions in Loops ; GCN-O2-NEXT: Induction Variable Users @@ -1057,23 +1047,16 @@ ; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction ; GCN-O3-NEXT: SROA -; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: Natural Loop Information -; GCN-O3-NEXT: Canonicalize natural loops -; GCN-O3-NEXT: LCSSA Verifier -; GCN-O3-NEXT: Loop-Closed SSA Form Pass ; GCN-O3-NEXT: Scalar Evolution Analysis -; GCN-O3-NEXT: Lazy Branch Probability Analysis -; GCN-O3-NEXT: Lazy Block Frequency Analysis -; GCN-O3-NEXT: Loop Pass Manager -; GCN-O3-NEXT: Loop Invariant Code Motion ; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Straight line strength reduction +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Memory Dependence Analysis +; GCN-O3-NEXT: Lazy Branch Probability Analysis +; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Global Value Numbering ; GCN-O3-NEXT: Scalar Evolution Analysis @@ -1083,8 +1066,16 @@ ; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: AMDGPU IR optimizations ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: Canonicalize natural loops +; GCN-O3-NEXT: LCSSA Verifier +; GCN-O3-NEXT: Loop-Closed SSA Form Pass ; GCN-O3-NEXT: Scalar Evolution Analysis +; GCN-O3-NEXT: Lazy Branch Probability Analysis +; GCN-O3-NEXT: Lazy Block Frequency Analysis +; GCN-O3-NEXT: Loop Pass Manager +; GCN-O3-NEXT: Loop Invariant Code Motion ; GCN-O3-NEXT: Loop Pass Manager ; GCN-O3-NEXT: Canonicalize Freeze Instructions in Loops ; GCN-O3-NEXT: Induction Variable Users Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -215,7 +215,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[22:23] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow6 +; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end @@ -445,12 +445,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow3 +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v1 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0 -; GCN-IR-NEXT: .LBB1_6: ; %Flow4 +; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v6 @@ -1052,7 +1052,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[22:23] ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 -; GCN-IR-NEXT: .LBB9_4: ; %Flow3 +; GCN-IR-NEXT: .LBB9_4: ; %Flow4 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end @@ -1256,7 +1256,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3 -; GCN-IR-NEXT: .LBB10_4: ; %Flow5 +; GCN-IR-NEXT: .LBB10_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GCN-IR-NEXT: .LBB10_5: ; %udiv-end @@ -1455,12 +1455,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB11_5: ; %Flow3 +; GCN-IR-NEXT: .LBB11_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0 -; GCN-IR-NEXT: .LBB11_6: ; %Flow4 +; GCN-IR-NEXT: .LBB11_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v7, v2 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3 @@ -1651,12 +1651,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB12_5: ; %Flow3 +; GCN-IR-NEXT: .LBB12_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0 -; GCN-IR-NEXT: .LBB12_6: ; %Flow4 +; GCN-IR-NEXT: .LBB12_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v7, v2 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3 @@ -1748,12 +1748,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB13_5: ; %Flow3 +; GCN-IR-NEXT: .LBB13_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[3:4], 1 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: .LBB13_6: ; %Flow4 +; GCN-IR-NEXT: .LBB13_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v2 ; GCN-IR-NEXT: v_xor_b32_e32 v3, v6, v1 Index: llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -166,148 +166,42 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { ; SI-LABEL: loop_land_info_assert: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0x0 -; SI-NEXT: v_bfrev_b32_e32 v0, 44 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dword s2, s[0:1], 0xa ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s2, 1 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_cmp_lt_i32 s3, 4 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s3, 3 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[2:3] -; SI-NEXT: v_cmp_lt_f32_e64 s[8:9], |s8|, v0 -; SI-NEXT: s_and_b64 s[2:3], exec, s[6:7] -; SI-NEXT: s_and_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_and_b64 s[6:7], exec, s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, 3 -; SI-NEXT: s_branch .LBB3_3 -; SI-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB3_2: ; %Flow -; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: s_and_b64 vcc, exec, s[14:15] -; SI-NEXT: s_cbranch_vccnz .LBB3_8 -; SI-NEXT: .LBB3_3: ; %while.cond -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_mov_b64 s[12:13], -1 -; SI-NEXT: s_mov_b64 s[8:9], -1 -; SI-NEXT: s_mov_b64 s[14:15], -1 -; SI-NEXT: s_mov_b64 vcc, s[2:3] -; SI-NEXT: s_cbranch_vccz .LBB3_2 -; SI-NEXT: ; %bb.4: ; %convex.exit -; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: s_mov_b64 vcc, s[4:5] -; SI-NEXT: s_cbranch_vccz .LBB3_1 -; SI-NEXT: ; %bb.5: ; %if.end -; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: s_mov_b64 vcc, s[6:7] -; SI-NEXT: s_cbranch_vccz .LBB3_7 -; SI-NEXT: ; %bb.6: ; %if.else -; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[14:15], 0 -; SI-NEXT: .LBB3_7: ; %Flow6 -; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: s_mov_b64 s[12:13], 0 -; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_8: ; %loop.exit.guard4 -; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; SI-NEXT: s_and_b64 vcc, exec, s[12:13] -; SI-NEXT: s_cbranch_vccz .LBB3_3 -; SI-NEXT: ; %bb.9: ; %loop.exit.guard -; SI-NEXT: s_and_b64 vcc, exec, s[8:9] -; SI-NEXT: s_cbranch_vccz .LBB3_13 -; SI-NEXT: ; %bb.10: ; %for.cond.preheader +; SI-NEXT: s_cmp_lt_i32 s2, 4 +; SI-NEXT: s_cbranch_scc1 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %for.cond.preheader ; SI-NEXT: s_load_dword s0, s[0:1], 0xc ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmpk_lt_i32 s0, 0x3e8 -; SI-NEXT: s_cbranch_scc0 .LBB3_13 -; SI-NEXT: ; %bb.11: ; %for.body +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.2: ; %for.body ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB3_12: ; %self.loop +; SI-NEXT: .LBB3_3: ; %self.loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB3_12 -; SI-NEXT: .LBB3_13: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB3_3 +; SI-NEXT: .LBB3_4: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: loop_land_info_assert: ; FLAT: ; %bb.0: ; %entry -; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s8, s[0:1], 0x0 -; FLAT-NEXT: v_bfrev_b32_e32 v0, 44 -; FLAT-NEXT: s_mov_b32 s11, 0xf000 -; FLAT-NEXT: s_mov_b32 s10, -1 +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x28 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_cmp_lt_i32 s2, 1 -; FLAT-NEXT: s_cselect_b64 s[4:5], -1, 0 -; FLAT-NEXT: s_cmp_lt_i32 s3, 4 -; FLAT-NEXT: s_cselect_b64 s[6:7], -1, 0 -; FLAT-NEXT: s_cmp_gt_i32 s3, 3 -; FLAT-NEXT: s_cselect_b64 s[2:3], -1, 0 -; FLAT-NEXT: s_and_b64 s[4:5], s[4:5], s[2:3] -; FLAT-NEXT: v_cmp_lt_f32_e64 s[8:9], |s8|, v0 -; FLAT-NEXT: s_and_b64 s[2:3], exec, s[6:7] -; FLAT-NEXT: s_and_b64 s[4:5], exec, s[4:5] -; FLAT-NEXT: s_and_b64 s[6:7], exec, s[8:9] -; FLAT-NEXT: v_mov_b32_e32 v0, 3 -; FLAT-NEXT: s_branch .LBB3_3 -; FLAT-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: s_mov_b64 s[8:9], 0 -; FLAT-NEXT: .LBB3_2: ; %Flow -; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: s_and_b64 vcc, exec, s[14:15] -; FLAT-NEXT: s_cbranch_vccnz .LBB3_8 -; FLAT-NEXT: .LBB3_3: ; %while.cond -; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_mov_b64 s[12:13], -1 -; FLAT-NEXT: s_mov_b64 s[8:9], -1 -; FLAT-NEXT: s_mov_b64 s[14:15], -1 -; FLAT-NEXT: s_mov_b64 vcc, s[2:3] -; FLAT-NEXT: s_cbranch_vccz .LBB3_2 -; FLAT-NEXT: ; %bb.4: ; %convex.exit -; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: s_mov_b64 vcc, s[4:5] -; FLAT-NEXT: s_cbranch_vccz .LBB3_1 -; FLAT-NEXT: ; %bb.5: ; %if.end -; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: s_mov_b64 vcc, s[6:7] -; FLAT-NEXT: s_cbranch_vccz .LBB3_7 -; FLAT-NEXT: ; %bb.6: ; %if.else -; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; FLAT-NEXT: s_waitcnt vmcnt(0) -; FLAT-NEXT: s_mov_b64 s[14:15], 0 -; FLAT-NEXT: .LBB3_7: ; %Flow6 -; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: s_mov_b64 s[12:13], 0 -; FLAT-NEXT: ; implicit-def: $sgpr8_sgpr9 -; FLAT-NEXT: s_branch .LBB3_2 -; FLAT-NEXT: .LBB3_8: ; %loop.exit.guard4 -; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; FLAT-NEXT: s_and_b64 vcc, exec, s[12:13] -; FLAT-NEXT: s_cbranch_vccz .LBB3_3 -; FLAT-NEXT: ; %bb.9: ; %loop.exit.guard -; FLAT-NEXT: s_and_b64 vcc, exec, s[8:9] -; FLAT-NEXT: s_cbranch_vccz .LBB3_13 -; FLAT-NEXT: ; %bb.10: ; %for.cond.preheader +; FLAT-NEXT: s_cmp_lt_i32 s2, 4 +; FLAT-NEXT: s_cbranch_scc1 .LBB3_4 +; FLAT-NEXT: ; %bb.1: ; %for.cond.preheader ; FLAT-NEXT: s_load_dword s0, s[0:1], 0x30 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmpk_lt_i32 s0, 0x3e8 -; FLAT-NEXT: s_cbranch_scc0 .LBB3_13 -; FLAT-NEXT: ; %bb.11: ; %for.body +; FLAT-NEXT: s_cbranch_scc0 .LBB3_4 +; FLAT-NEXT: ; %bb.2: ; %for.body ; FLAT-NEXT: s_and_b64 vcc, exec, 0 -; FLAT-NEXT: .LBB3_12: ; %self.loop +; FLAT-NEXT: .LBB3_3: ; %self.loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_mov_b64 vcc, vcc -; FLAT-NEXT: s_cbranch_vccz .LBB3_12 -; FLAT-NEXT: .LBB3_13: ; %DummyReturnBlock +; FLAT-NEXT: s_cbranch_vccz .LBB3_3 +; FLAT-NEXT: .LBB3_4: ; %DummyReturnBlock ; FLAT-NEXT: s_endpgm entry: %cmp = icmp sgt i32 %c0, 0 Index: llvm/test/CodeGen/AMDGPU/srem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srem64.ll +++ llvm/test/CodeGen/AMDGPU/srem64.ll @@ -187,7 +187,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow6 +; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end @@ -421,12 +421,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow3 +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v7 ; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 -; GCN-IR-NEXT: .LBB1_6: ; %Flow4 +; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v7, v2, v9 ; GCN-IR-NEXT: v_mul_hi_u32 v8, v2, v6 @@ -1088,7 +1088,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[16:17], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 -; GCN-IR-NEXT: .LBB8_4: ; %Flow6 +; GCN-IR-NEXT: .LBB8_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GCN-IR-NEXT: .LBB8_5: ; %udiv-end @@ -1239,7 +1239,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 -; GCN-IR-NEXT: .LBB9_4: ; %Flow3 +; GCN-IR-NEXT: .LBB9_4: ; %Flow4 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end @@ -1446,7 +1446,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3 -; GCN-IR-NEXT: .LBB10_4: ; %Flow5 +; GCN-IR-NEXT: .LBB10_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[2:3] ; GCN-IR-NEXT: .LBB10_5: ; %udiv-end @@ -1643,12 +1643,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB11_5: ; %Flow3 +; GCN-IR-NEXT: .LBB11_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v6 -; GCN-IR-NEXT: .LBB11_6: ; %Flow4 +; GCN-IR-NEXT: .LBB11_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 ; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v3 @@ -1837,12 +1837,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB12_5: ; %Flow3 +; GCN-IR-NEXT: .LBB12_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 ; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 -; GCN-IR-NEXT: .LBB12_6: ; %Flow4 +; GCN-IR-NEXT: .LBB12_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 @@ -1940,12 +1940,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB13_5: ; %Flow3 +; GCN-IR-NEXT: .LBB13_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: .LBB13_6: ; %Flow4 +; GCN-IR-NEXT: .LBB13_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[6:7], 15 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 Index: llvm/test/CodeGen/AMDGPU/udiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udiv64.ll +++ llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -188,7 +188,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow6 +; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end @@ -387,12 +387,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow3 +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0 -; GCN-IR-NEXT: .LBB1_6: ; %Flow4 +; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v4 @@ -855,7 +855,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 -; GCN-IR-NEXT: .LBB7_4: ; %Flow3 +; GCN-IR-NEXT: .LBB7_4: ; %Flow4 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB7_5: ; %udiv-end @@ -1039,7 +1039,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 -; GCN-IR-NEXT: .LBB8_4: ; %Flow5 +; GCN-IR-NEXT: .LBB8_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB8_5: ; %udiv-end @@ -1223,12 +1223,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB9_5: ; %Flow3 +; GCN-IR-NEXT: .LBB9_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB9_6: ; %Flow4 +; GCN-IR-NEXT: .LBB9_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 @@ -1309,12 +1309,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB10_5: ; %Flow3 +; GCN-IR-NEXT: .LBB10_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB10_6: ; %Flow4 +; GCN-IR-NEXT: .LBB10_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 @@ -1483,7 +1483,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_3 -; GCN-IR-NEXT: .LBB11_4: ; %Flow5 +; GCN-IR-NEXT: .LBB11_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB11_5: ; %udiv-end @@ -1659,12 +1659,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB12_5: ; %Flow3 +; GCN-IR-NEXT: .LBB12_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB12_6: ; %Flow4 +; GCN-IR-NEXT: .LBB12_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 Index: llvm/test/CodeGen/AMDGPU/urem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/urem64.ll +++ llvm/test/CodeGen/AMDGPU/urem64.ll @@ -187,7 +187,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow6 +; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end @@ -396,12 +396,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow3 +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 ; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 -; GCN-IR-NEXT: .LBB1_6: ; %Flow4 +; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v5, v2, v7 ; GCN-IR-NEXT: v_mul_hi_u32 v6, v2, v4 @@ -867,7 +867,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB6_3 -; GCN-IR-NEXT: .LBB6_4: ; %Flow5 +; GCN-IR-NEXT: .LBB6_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-IR-NEXT: .LBB6_5: ; %udiv-end @@ -1049,7 +1049,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[14:15] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 -; GCN-IR-NEXT: .LBB7_4: ; %Flow5 +; GCN-IR-NEXT: .LBB7_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-IR-NEXT: .LBB7_5: ; %udiv-end @@ -1241,12 +1241,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB8_5: ; %Flow3 +; GCN-IR-NEXT: .LBB8_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 ; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 -; GCN-IR-NEXT: .LBB8_6: ; %Flow4 +; GCN-IR-NEXT: .LBB8_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 @@ -1333,12 +1333,12 @@ ; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB9_5: ; %Flow3 +; GCN-IR-NEXT: .LBB9_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB9_6: ; %Flow4 +; GCN-IR-NEXT: .LBB9_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 15 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2