diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -257,6 +257,13 @@
 Optional<APInt> ConstantFoldExtOp(unsigned Opcode, const Register Op1,
                                   uint64_t Imm, const MachineRegisterInfo &MRI);
 
+/// Try to constant fold the integer-to-float conversion \p Opcode (G_SITOFP or
+/// G_UITOFP) of \p Src into a float of type \p DstTy. Returns None when no
+/// constant value is found for \p Src.
+Optional<APFloat> ConstantFoldIntToFloat(unsigned Opcode, LLT DstTy,
+                                         Register Src,
+                                         const MachineRegisterInfo &MRI);
+
 /// Test if the given value is known to have exactly one bit set. This differs
 /// from computeKnownBits in that it doesn't necessarily determine which bit is
 /// set.
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -203,6 +203,16 @@
       return buildConstant(Dst, *MaybeCst);
     break;
   }
+  case TargetOpcode::G_SITOFP:
+  case TargetOpcode::G_UITOFP: {
+    // Fold conversions of a constant integer source into a float constant.
+    assert(SrcOps.size() == 1 && "Invalid sources");
+    assert(DstOps.size() == 1 && "Invalid dsts");
+    if (Optional<APFloat> Cst = ConstantFoldIntToFloat(
+            Opc, DstOps[0].getLLTTy(*getMRI()), SrcOps[0].getReg(), *getMRI()))
+      return buildFConstant(DstOps[0], *Cst);
+    break;
+  }
   }
   bool CanCopy = checkCopyToDefsPossible(DstOps);
   if (!canPerformCSEForOpc(Opc))
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -673,6 +673,22 @@
   return None;
 }
 
+Optional<APFloat> llvm::ConstantFoldIntToFloat(unsigned Opcode, LLT DstTy,
+                                               Register Src,
+                                               const MachineRegisterInfo &MRI) {
+  assert(Opcode == TargetOpcode::G_SITOFP || Opcode == TargetOpcode::G_UITOFP);
+  if (auto MaybeSrcVal = getConstantVRegVal(Src, MRI)) {
+    // Start from a value in the destination's float semantics and convert the
+    // integer into it; G_SITOFP treats the source as signed, G_UITOFP as
+    // unsigned.
+    APFloat DstVal(getFltSemanticForLLT(DstTy));
+    DstVal.convertFromAPInt(*MaybeSrcVal, Opcode == TargetOpcode::G_SITOFP,
+                            APFloat::rmNearestTiesToEven);
+    return DstVal;
+  }
+  return None;
+}
+
 bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI,
                                   GISelKnownBits *KB) {
   Optional<DefinitionAndSourceRegister> DefSrcReg =
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -689,8 +689,7 @@ define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, 2.0 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -68,9 +68,8 @@ ; GCN-LABEL: v_powi_neg1_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v1, -1 ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -1) @@ -82,8 +81,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 2 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 2) @@ -94,9 +92,8 @@ ; GCN-LABEL: v_powi_neg2_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v1, -2 ; GCN-NEXT: v_log_f32_e32 
v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -2) @@ -108,8 +105,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 4 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, 4.0, v0 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 4) @@ -121,8 +117,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 8 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, 0x41000000, v0 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 8) @@ -134,8 +129,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 16 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, 0x41800000, v0 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 16) @@ -147,8 +141,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 0x80 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, 0x43000000, v0 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 128) @@ -159,9 +152,8 @@ ; GCN-LABEL: v_powi_neg128_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v1, 0xffffff80 ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: 
v_mul_legacy_f32_e32 v0, 0xc3000000, v0 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -128) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -296,16 +296,15 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_movk_i32 s6, 0x1000 ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xfffff000 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 +; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s6 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 @@ -379,56 +378,54 @@ ; CGP-LABEL: v_sdiv_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 +; CGP-NEXT: s_movk_i32 s8, 0x1000 ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_mov_b32_e32 v3, 0x1000 -; CGP-NEXT: s_movk_i32 s5, 0xf000 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 +; CGP-NEXT: s_movk_i32 s4, 0xf000 ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x45800000 ; 
CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, s4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s5, v6 +; CGP-NEXT: v_mul_lo_u32 v8, s4, v3 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 +; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v6 +; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v3 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 ; CGP-NEXT: v_lshlrev_b32_e32 v9, 12, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s4, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v3 +; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 ; CGP-NEXT: 
v_cndmask_b32_e32 v0, v0, v7, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, ret <2 x i32> %result @@ -440,16 +437,15 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 +; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s6 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 @@ -523,56 +519,54 @@ ; CGP-LABEL: v_sdiv_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 
0x12d8fb +; CGP-NEXT: s_mov_b32 s8, 0x12d8fb ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb -; CGP-NEXT: s_mov_b32 s5, 0xffed2705 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 +; CGP-NEXT: s_mov_b32 s4, 0xffed2705 ; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x4996c7d8 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, s4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s5, v6 +; CGP-NEXT: v_mul_lo_u32 v8, s4, v3 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 +; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v6, s4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v6 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v3, s8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v5 ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, 
s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s4, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v3 +; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, ret <2 x i32> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -272,16 +272,15 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xfffff000 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 +; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s4 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_xor_b32_e32 
v0, v0, v1 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -351,50 +350,48 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_mov_b32_e32 v3, 0x1000 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 ; CGP-NEXT: s_movk_i32 s5, 0xf000 ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x45800000 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, s4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s5, v6 +; CGP-NEXT: v_mul_lo_u32 v8, s5, v3 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: 
v_add_i32_e32 v3, vcc, v3, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 +; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_lshlrev_b32_e32 v6, 12, v6 +; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3 ; CGP-NEXT: v_lshlrev_b32_e32 v4, 12, v4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v1, v3 +; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v1, v3 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, ret <2 x i32> %result @@ -406,16 +403,15 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 +; 
CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s4 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_mul_lo_u32 v2, v2, s4 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -485,50 +481,48 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: s_mov_b32 s4, 0x12d8fb ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 ; CGP-NEXT: s_mov_b32 s5, 0xffed2705 ; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, 0x4996c7d8 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, s4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s5, v6 +; CGP-NEXT: v_mul_lo_u32 v8, s5, v3 ; 
CGP-NEXT: v_mul_lo_u32 v4, v4, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 +; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, s4 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v3, v3, s4 +; CGP-NEXT: v_mul_lo_u32 v4, v4, s4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v1, v3 +; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v1, v3 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, ret <2 x i32> %result diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -223,14 +223,13 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_mov_b32_e32 v1, 0xfffff000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s6 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v1, v2, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x45800000 +; CHECK-NEXT: v_mov_b32_e32 v2, 0xfffff000 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v1 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1 @@ -295,45 +294,43 @@ ; CGP-LABEL: v_udiv_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 -; CGP-NEXT: s_movk_i32 s5, 0xf000 +; CGP-NEXT: s_movk_i32 s8, 0x1000 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 +; CGP-NEXT: s_movk_i32 s4, 0xf000 ; CGP-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, s4 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, 0x45800000 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; 
CGP-NEXT: v_mul_lo_u32 v6, s5, v4 +; CGP-NEXT: v_mul_lo_u32 v6, s4, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v2, v6 ; CGP-NEXT: v_mul_hi_u32 v3, v5, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 +; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v5, 12, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4 +; CGP-NEXT: v_lshlrev_b32_e32 v5, 12, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v3 ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; CGP-NEXT: v_subrev_i32_e64 v5, s[4:5], s4, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CGP-NEXT: v_subrev_i32_e64 v5, s[4:5], s8, v0 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v2 +; CGP-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] ; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i32> %num, @@ -345,14 +342,13 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: 
s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_mov_b32_e32 v1, 0xffed2705 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s6 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v1, v2, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x4996c7d8 +; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_mul_lo_u32 v2, v1, s6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1 @@ -417,44 +413,42 @@ ; CGP-LABEL: v_udiv_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0x12d8fb -; CGP-NEXT: v_mov_b32_e32 v2, 0x12d8fb -; CGP-NEXT: s_mov_b32 s5, 0xffed2705 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, s4 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; CGP-NEXT: s_mov_b32 s8, 0x12d8fb +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 +; CGP-NEXT: s_mov_b32 s4, 0xffed2705 +; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x4996c7d8 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mul_lo_u32 v5, s5, v3 -; CGP-NEXT: v_mul_lo_u32 v6, s5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 +; CGP-NEXT: v_mul_lo_u32 v5, s4, v2 +; CGP-NEXT: v_mul_lo_u32 v6, s4, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v2, v5 ; CGP-NEXT: v_mul_hi_u32 v6, v4, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: 
v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v5, v3, s4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v3 -; CGP-NEXT: v_mul_lo_u32 v7, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v2, s8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2 +; CGP-NEXT: v_mul_lo_u32 v7, v4, s8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; CGP-NEXT: v_subrev_i32_e64 v5, s[4:5], s4, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CGP-NEXT: v_subrev_i32_e64 v5, s[4:5], s8, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3 ; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v2 +; CGP-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] ; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i32> %num, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -230,14 +230,13 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb -; CHECK-NEXT: v_mov_b32_e32 v1, 0xffed2705 -; 
CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v1, v2, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x4996c7d8 +; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_mul_lo_u32 v1, v1, s4 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -290,33 +289,31 @@ ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: s_mov_b32 s4, 0x12d8fb -; CGP-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 ; CGP-NEXT: s_mov_b32 s5, 0xffed2705 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, s4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_mul_lo_u32 v4, s5, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, s4 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v3, s5, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v0, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, s4 +; CGP-NEXT: v_mul_lo_u32 v2, v2, s4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, 
v2 +; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 +; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 +; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i32> %num, ret <2 x i32> %result