diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -433,6 +433,8 @@ // Current recorded maximum possible occupancy. unsigned Occupancy; + mutable Optional<bool> UsesAGPRs; + MCPhysReg getNextUserSGPR() const; MCPhysReg getNextSystemSGPR() const; @@ -946,6 +948,9 @@ Occupancy = Limit; limitOccupancy(MF); } + + // \returns true if a function needs or may need AGPRs. + bool usesAGPRs(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -651,3 +651,35 @@ } return false; } + +bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const { + if (UsesAGPRs) + return *UsesAGPRs; + + if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) || + MF.getFrameInfo().hasCalls()) { + UsesAGPRs = true; + return true; + } + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + const Register Reg = Register::index2VirtReg(I); + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + if (RC && SIRegisterInfo::isAGPRClass(RC)) { + UsesAGPRs = true; + return true; + } + } + + for (MCRegister Reg : AMDGPU::AGPR_32RegClass) { + if (MRI.isPhysRegUsed(Reg)) { + UsesAGPRs = true; + return true; + } + } + + UsesAGPRs = false; + return false; +} diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -501,18 +501,36 @@ reserveRegisterTuples(Reserved, Reg); } + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); - // TODO: In an entry 
function without calls and AGPRs used it is possible - // to use the whole register budget for VGPRs. Even more it shall - // be possible to estimate maximum AGPR/VGPR pressure and split - // register file accordingly. - if (ST.hasGFX90AInsts()) - MaxNumVGPRs /= 2; + unsigned MaxNumAGPRs = MaxNumVGPRs; unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + + if (ST.hasGFX90AInsts()) { + // In an entry function without calls and AGPRs used it is possible to use + // the whole register budget for VGPRs. + + // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and + // split register file accordingly. + if (MFI->usesAGPRs(MF)) { + MaxNumVGPRs /= 2; + MaxNumAGPRs = MaxNumVGPRs; + } else { + if (MaxNumVGPRs > TotalNumVGPRs) { + MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; + MaxNumVGPRs = TotalNumVGPRs; + } else + MaxNumAGPRs = 0; + } + } + for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); reserveRegisterTuples(Reserved, Reg); - Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + } + + for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { + unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); reserveRegisterTuples(Reserved, Reg); } @@ -536,8 +554,6 @@ } } - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Register ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -12678,216 +12678,216 @@ ; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: s_addc_u32 s3, s3, s10 ; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX90A-NEXT: v_cvt_f32_u32_e32 
v1, s12 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 ; GFX90A-NEXT: s_sub_u32 s14, 0, s12 ; GFX90A-NEXT: s_subb_u32 s15, 0, s13 -; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 -; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mac_f32_e32 v1, s16, v2 +; GFX90A-NEXT: v_rcp_f32_e32 v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 -; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 -; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s17, v1 +; GFX90A-NEXT: v_mul_f32_e32 v2, s18, v1 +; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 +; GFX90A-NEXT: v_mac_f32_e32 v1, s19, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_hi_u32 v3, s14, v0 -; GFX90A-NEXT: v_mul_lo_u32 v5, s14, v1 -; GFX90A-NEXT: v_mul_lo_u32 v2, s15, v0 -; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX90A-NEXT: v_mul_lo_u32 v6, s14, v0 -; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_mul_hi_u32 v4, s14, v1 +; GFX90A-NEXT: v_mul_lo_u32 v5, s14, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, s15, v1 +; GFX90A-NEXT: v_add_u32_e32 v4, v4, v5 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v6, s14, v1 +; GFX90A-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v6 +; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 -; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v8, v2, v6 +; GFX90A-NEXT: v_mul_lo_u32 v6, v2, v6 ; 
GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 -; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v6, s14, v3 -; GFX90A-NEXT: v_mul_hi_u32 v7, s14, v0 -; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX90A-NEXT: v_mul_lo_u32 v7, s15, v0 -; GFX90A-NEXT: v_add_u32_e32 v6, v6, v7 -; GFX90A-NEXT: v_mul_lo_u32 v8, s14, v0 +; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v2, v4, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v5, s14, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, s14, v1 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s15, v1 +; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX90A-NEXT: v_mul_lo_u32 v8, s14, v1 ; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 ; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v6 -; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 -; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v6 +; GFX90A-NEXT: v_mul_lo_u32 v12, v1, v5 +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v8 +; GFX90A-NEXT: v_mul_hi_u32 v11, v1, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 ; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v2, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v5, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s4, s14 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 ; GFX90A-NEXT: s_mov_b32 s15, s14 ; GFX90A-NEXT: s_addc_u32 s1, s5, s14 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] -; GFX90A-NEXT: v_mul_lo_u32 v5, s4, v1 -; GFX90A-NEXT: v_mul_hi_u32 v6, s4, v0 -; GFX90A-NEXT: v_mul_hi_u32 v3, s4, v1 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX90A-NEXT: v_mul_lo_u32 v4, s4, v2 +; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, s4, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 -; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 -; GFX90A-NEXT: v_mul_hi_u32 v6, s5, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v1 ; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX90A-NEXT: v_mul_hi_u32 v5, s12, v0 -; GFX90A-NEXT: v_add_u32_e32 v3, v5, 
v3 -; GFX90A-NEXT: v_mul_lo_u32 v5, s13, v0 -; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX90A-NEXT: v_mul_lo_u32 v6, s12, v0 -; GFX90A-NEXT: v_sub_u32_e32 v5, s5, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s12, v2 +; GFX90A-NEXT: v_mul_hi_u32 v4, s12, v1 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v4, s13, v1 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-NEXT: v_mul_lo_u32 v5, s12, v1 +; GFX90A-NEXT: v_sub_u32_e32 v4, s5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v7, s13 -; GFX90A-NEXT: v_sub_co_u32_e32 v6, vcc, s4, v6 -; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v7, vcc -; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s12, v6 -; GFX90A-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v5 +; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s4, v5 +; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v7, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s12, v5 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v5 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v7, s[0:1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v8, v7, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] ; GFX90A-NEXT: v_mov_b32_e32 v8, s5 -; GFX90A-NEXT: 
v_add_co_u32_e64 v5, s[0:1], v0, v5 +; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v1, v4 ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX90A-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v6 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 ; GFX90A-NEXT: s_xor_b64 s[0:1], s[14:15], s[10:11] ; GFX90A-NEXT: s_ashr_i32 s4, s9, 31 -; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v3 ; GFX90A-NEXT: s_add_u32 s8, s8, s4 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v5, vcc ; GFX90A-NEXT: s_mov_b32 s5, s4 ; GFX90A-NEXT: s_addc_u32 s9, s9, s4 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX90A-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s9 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s9 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX90A-NEXT: v_xor_b32_e32 v5, s1, v2 +; GFX90A-NEXT: v_mac_f32_e32 v3, s16, v4 +; GFX90A-NEXT: v_rcp_f32_e32 v4, v3 +; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, s1 +; GFX90A-NEXT: v_mul_f32_e32 v1, s17, v4 +; GFX90A-NEXT: v_mul_f32_e32 v4, s18, v1 +; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 +; GFX90A-NEXT: v_mac_f32_e32 v1, s19, v4 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX90A-NEXT: s_sub_u32 s10, 0, s8 -; GFX90A-NEXT: v_mac_f32_e32 v3, s16, v5 -; GFX90A-NEXT: v_rcp_f32_e32 v3, v3 -; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 -; 
GFX90A-NEXT: v_mov_b32_e32 v6, s1 -; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 -; GFX90A-NEXT: v_mul_f32_e32 v3, s17, v3 -; GFX90A-NEXT: v_mul_f32_e32 v5, s18, v3 -; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 -; GFX90A-NEXT: v_mac_f32_e32 v3, s19, v5 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc ; GFX90A-NEXT: s_subb_u32 s11, 0, s9 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX90A-NEXT: v_mul_hi_u32 v7, s10, v3 -; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v5 -; GFX90A-NEXT: v_mul_lo_u32 v6, s11, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, s10, v1 +; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v4 +; GFX90A-NEXT: v_mul_lo_u32 v5, s11, v1 ; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 -; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX90A-NEXT: v_mul_lo_u32 v9, s10, v3 -; GFX90A-NEXT: v_mul_lo_u32 v8, v3, v6 -; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v9 -; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v6 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v9, s10, v1 +; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v1, v9 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX90A-NEXT: v_mul_hi_u32 v11, v5, v9 -; GFX90A-NEXT: v_mul_lo_u32 v9, v5, v9 +; GFX90A-NEXT: v_mul_hi_u32 v11, v4, v9 +; GFX90A-NEXT: v_mul_lo_u32 v9, v4, v9 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 -; GFX90A-NEXT: v_mul_hi_u32 v10, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v10, v4, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v6, v5, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v2, v8, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v6 -; GFX90A-NEXT: v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v6 -; GFX90A-NEXT: 
v_mul_hi_u32 v9, s10, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v4, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v8, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v5 +; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v4, v7, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, s10, v1 ; GFX90A-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX90A-NEXT: v_mul_lo_u32 v9, s11, v3 +; GFX90A-NEXT: v_mul_lo_u32 v9, s11, v1 ; GFX90A-NEXT: v_add_u32_e32 v8, v8, v9 -; GFX90A-NEXT: v_mul_lo_u32 v10, s10, v3 -; GFX90A-NEXT: v_mul_hi_u32 v11, v6, v10 -; GFX90A-NEXT: v_mul_lo_u32 v12, v6, v10 -; GFX90A-NEXT: v_mul_lo_u32 v14, v3, v8 -; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v10 -; GFX90A-NEXT: v_mul_hi_u32 v13, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v10, s10, v1 +; GFX90A-NEXT: v_mul_hi_u32 v11, v5, v10 +; GFX90A-NEXT: v_mul_lo_u32 v12, v5, v10 +; GFX90A-NEXT: v_mul_lo_u32 v14, v1, v8 +; GFX90A-NEXT: v_mul_hi_u32 v10, v1, v10 +; GFX90A-NEXT: v_mul_hi_u32 v13, v1, v8 ; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v14 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 -; GFX90A-NEXT: v_mul_hi_u32 v9, v6, v8 +; GFX90A-NEXT: v_mul_hi_u32 v9, v5, v8 ; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v6, v6, v8 -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v2, v9, vcc -; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v5, v8 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v9, vcc +; GFX90A-NEXT: v_add_u32_e32 v4, v4, v7 ; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 -; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e64 v4, vcc, v4, 
v8, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s6, s10 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v5 ; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: s_addc_u32 s1, s7, s10 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v5 -; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v3 -; GFX90A-NEXT: v_mul_hi_u32 v6, s6, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v4 +; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v4 ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v3 -; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 -; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v5, s7, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v6, vcc -; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v2 -; GFX90A-NEXT: v_mul_hi_u32 v6, s8, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1 +; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v4, s7, v4 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v4 +; GFX90A-NEXT: v_mul_hi_u32 v6, s8, v1 ; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX90A-NEXT: v_mul_lo_u32 v6, s9, v3 +; GFX90A-NEXT: v_mul_lo_u32 v6, s9, v1 ; GFX90A-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX90A-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX90A-NEXT: 
v_mul_lo_u32 v7, s8, v1 ; GFX90A-NEXT: v_sub_u32_e32 v6, s7, v5 ; GFX90A-NEXT: v_mov_b32_e32 v8, s9 ; GFX90A-NEXT: v_sub_co_u32_e32 v7, vcc, s6, v7 @@ -12909,19 +12909,19 @@ ; GFX90A-NEXT: v_cndmask_b32_e64 v6, 1, 2, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v5 -; GFX90A-NEXT: v_add_co_u32_e64 v6, s[0:1], v3, v6 +; GFX90A-NEXT: v_add_co_u32_e64 v6, s[0:1], v1, v6 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc -; GFX90A-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v2, s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v4, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX90A-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX90A-NEXT: v_xor_b32_e32 v5, s1, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX90A-NEXT: v_xor_b32_e32 v5, s1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v6, s1 -; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: v_subrev_co_u32_e32 v4, vcc, s0, v1 +; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3] ; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y %r = sdiv <2 x i64> %x, %shl.y @@ -14426,222 +14426,222 @@ ; GFX90A-NEXT: s_mov_b32 s5, s4 ; GFX90A-NEXT: s_addc_u32 s3, s3, s4 ; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s12 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 ; GFX90A-NEXT: s_sub_u32 s2, 0, s12 ; GFX90A-NEXT: s_subb_u32 s3, 0, s13 -; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 -; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mac_f32_e32 v1, s16, v2 +; GFX90A-NEXT: v_rcp_f32_e32 v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 -; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 -; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s17, v1 +; GFX90A-NEXT: v_mul_f32_e32 v2, s18, v1 +; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 +; GFX90A-NEXT: v_mac_f32_e32 v1, s19, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 ; GFX90A-NEXT: s_mov_b32 s15, s14 -; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v1 -; GFX90A-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v0 -; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX90A-NEXT: v_add_u32_e32 v4, v4, v5 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v6 +; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 -; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v8, v2, v6 +; GFX90A-NEXT: v_mul_lo_u32 v6, v2, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 -; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; 
GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 -; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v3 -; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v0 -; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX90A-NEXT: v_mul_lo_u32 v7, s3, v0 -; GFX90A-NEXT: v_add_u32_e32 v6, v6, v7 -; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v0 +; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v2, v4, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v1 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s3, v1 +; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v1 ; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 ; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 -; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v6 -; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 -; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v6 +; GFX90A-NEXT: v_mul_lo_u32 v12, v1, v5 +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v8 +; GFX90A-NEXT: v_mul_hi_u32 v11, v1, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 ; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v6 +; GFX90A-NEXT: 
v_addc_co_u32_e32 v7, vcc, v7, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v2, v7, vcc -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v5 -; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v5, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s4, s14 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 ; GFX90A-NEXT: s_addc_u32 s1, s5, s14 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] -; GFX90A-NEXT: v_mul_lo_u32 v5, s4, v1 -; GFX90A-NEXT: v_mul_hi_u32 v6, s4, v0 -; GFX90A-NEXT: v_mul_hi_u32 v3, s4, v1 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX90A-NEXT: v_mul_lo_u32 v4, s4, v2 +; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, s4, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 -; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 -; GFX90A-NEXT: v_mul_hi_u32 v6, s5, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v1 ; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, s12, 
v2 +; GFX90A-NEXT: v_mul_hi_u32 v3, s12, v1 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, s13, v1 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX90A-NEXT: v_mul_lo_u32 v1, s12, v1 -; GFX90A-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 -; GFX90A-NEXT: v_mul_lo_u32 v3, s13, v0 -; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX90A-NEXT: v_mul_lo_u32 v0, s12, v0 -; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, s13 -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 -; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v5, vcc -; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s12, v0 +; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, s13 +; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, s4, v1 +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v1 ; GFX90A-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v3, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v6 -; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v5, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v5 +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 -; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v6 +; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s12, v5 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] ; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] -; GFX90A-NEXT: v_mov_b32_e32 v6, s5 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v5, s5 +; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc +; GFX90A-NEXT: 
v_cmp_le_u32_e32 vcc, s13, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v1 ; GFX90A-NEXT: s_ashr_i32 s0, s11, 31 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 ; GFX90A-NEXT: s_add_u32 s2, s10, s0 -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX90A-NEXT: s_mov_b32 s1, s0 ; GFX90A-NEXT: s_addc_u32 s3, s11, s0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX90A-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX90A-NEXT: v_xor_b32_e32 v0, s14, v0 -; GFX90A-NEXT: s_sub_u32 s2, 0, s4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s5 ; GFX90A-NEXT: v_xor_b32_e32 v1, s14, v1 -; GFX90A-NEXT: v_mac_f32_e32 v3, s16, v5 -; GFX90A-NEXT: v_rcp_f32_e32 v3, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, s14 -; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 -; GFX90A-NEXT: v_mul_f32_e32 v3, s17, v3 -; GFX90A-NEXT: v_mul_f32_e32 v5, s18, v3 -; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 -; GFX90A-NEXT: v_mac_f32_e32 v3, s19, v5 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX90A-NEXT: v_xor_b32_e32 v5, s14, v2 +; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s14, v1 +; GFX90A-NEXT: v_mac_f32_e32 v3, s16, v4 +; GFX90A-NEXT: v_rcp_f32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, s14 +; GFX90A-NEXT: s_sub_u32 s2, 0, s4 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX90A-NEXT: v_mul_f32_e32 v1, s17, v4 +; 
GFX90A-NEXT: v_mul_f32_e32 v4, s18, v1 +; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 +; GFX90A-NEXT: v_mac_f32_e32 v1, s19, v4 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX90A-NEXT: s_subb_u32 s3, 0, s5 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v3 -; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v5 -; GFX90A-NEXT: v_mul_lo_u32 v6, s3, v3 +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 +; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v4 +; GFX90A-NEXT: v_mul_lo_u32 v5, s3, v1 ; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 -; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX90A-NEXT: v_mul_lo_u32 v9, s2, v3 -; GFX90A-NEXT: v_mul_lo_u32 v8, v3, v6 -; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v9 -; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v6 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v9, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v1, v9 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX90A-NEXT: v_mul_hi_u32 v11, v5, v9 -; GFX90A-NEXT: v_mul_lo_u32 v9, v5, v9 +; GFX90A-NEXT: v_mul_hi_u32 v11, v4, v9 +; GFX90A-NEXT: v_mul_lo_u32 v9, v4, v9 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 -; GFX90A-NEXT: v_mul_hi_u32 v10, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v10, v4, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v6, v5, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v2, v8, vcc -; GFX90A-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v6 -; GFX90A-NEXT: v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1] -; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v6 -; GFX90A-NEXT: v_mul_hi_u32 v9, s2, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v4, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 
+; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v8, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v5 +; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v4, v7, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, s2, v1 ; GFX90A-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX90A-NEXT: v_mul_lo_u32 v9, s3, v3 +; GFX90A-NEXT: v_mul_lo_u32 v9, s3, v1 ; GFX90A-NEXT: v_add_u32_e32 v8, v8, v9 -; GFX90A-NEXT: v_mul_lo_u32 v10, s2, v3 -; GFX90A-NEXT: v_mul_hi_u32 v11, v6, v10 -; GFX90A-NEXT: v_mul_lo_u32 v12, v6, v10 -; GFX90A-NEXT: v_mul_lo_u32 v14, v3, v8 -; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v10 -; GFX90A-NEXT: v_mul_hi_u32 v13, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v10, s2, v1 +; GFX90A-NEXT: v_mul_hi_u32 v11, v5, v10 +; GFX90A-NEXT: v_mul_lo_u32 v12, v5, v10 +; GFX90A-NEXT: v_mul_lo_u32 v14, v1, v8 +; GFX90A-NEXT: v_mul_hi_u32 v10, v1, v10 +; GFX90A-NEXT: v_mul_hi_u32 v13, v1, v8 ; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v14 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 -; GFX90A-NEXT: v_mul_hi_u32 v9, v6, v8 +; GFX90A-NEXT: v_mul_hi_u32 v9, v5, v8 ; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v11, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v6, v6, v8 -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v2, v9, vcc -; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 -; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 -; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v5, v8 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v9, vcc +; GFX90A-NEXT: v_add_u32_e32 v4, v4, v7 +; GFX90A-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s6, s10 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v5 ; GFX90A-NEXT: s_mov_b32 s11, 
s10 ; GFX90A-NEXT: s_addc_u32 s1, s7, s10 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v5 -; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v3 -; GFX90A-NEXT: v_mul_hi_u32 v6, s6, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v4 +; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v4 ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v3 -; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 -; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v9, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v4, vcc -; GFX90A-NEXT: v_mul_lo_u32 v5, s7, v5 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v6, vcc -; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v2 -; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v3 -; GFX90A-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 -; GFX90A-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v3 -; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1 +; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc +; GFX90A-NEXT: v_mul_lo_u32 v4, s7, v4 +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX90A-NEXT: v_mul_lo_u32 v4, s4, v4 +; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v1 +; GFX90A-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v1 +; GFX90A-NEXT: v_add_u32_e32 v4, v4, v5 +; GFX90A-NEXT: v_mul_lo_u32 v1, s4, v1 +; GFX90A-NEXT: v_sub_u32_e32 
v5, s7, v4 ; GFX90A-NEXT: v_mov_b32_e32 v6, s5 -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 +; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 ; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s4, v3 +; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s4, v1 ; GFX90A-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v8 ; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] @@ -14655,23 +14655,23 @@ ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 ; GFX90A-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] ; GFX90A-NEXT: v_mov_b32_e32 v7, s7 -; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v4 ; GFX90A-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX90A-NEXT: v_xor_b32_e32 v3, s10, v3 -; GFX90A-NEXT: v_xor_b32_e32 v5, s10, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX90A-NEXT: v_xor_b32_e32 v1, s10, v1 +; GFX90A-NEXT: v_xor_b32_e32 v5, s10, v4 ; GFX90A-NEXT: v_mov_b32_e32 v6, s10 -; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s10, v3 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX90A-NEXT: v_subrev_co_u32_e32 v4, vcc, s10, v1 +; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[8:9] ; GFX90A-NEXT: 
s_endpgm %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll @@ -548,6 +548,7 @@ ; GFX10CU-WAVE32: NumVgprs: 128 ; GFX10CU-WAVE64: NumVgprs: 128 define amdgpu_kernel void @f512() #512 { + call void @foo() call void @use256vgprs() ret void } @@ -563,7 +564,11 @@ ; GFX10CU-WAVE32: NumVgprs: 64 ; GFX10CU-WAVE64: NumVgprs: 64 define amdgpu_kernel void @f1024() #1024 { + call void @foo() call void @use256vgprs() ret void } + attributes #1024 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" } + +declare void @foo() diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll copy from llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll copy to llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll @@ -1,11 +1,6 @@ ; -enable-misched=false makes the register usage more predictable ; -regalloc=fast just makes the test run faster -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9 ; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A -; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 
-amdgpu-function-calls=false -enable-misched=false --sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64 define internal void @use256vgprs() { %v0 = call i32 asm sideeffect "; def $0", "=v"() @@ -523,47 +518,183 @@ ret void } -; GCN-LABEL: {{^}}f256: -; GFX9: NumVgprs: 256 +define internal void @use512vgprs() { + %v0 = call <32 x i32> asm sideeffect "; def $0", "=v"() + %v1 = call <32 x i32> asm sideeffect "; def $0", "=v"() + %v2 = call <32 x i32> asm sideeffect "; def $0", "=v"() + %v3 = call <32 x i32> asm sideeffect "; def $0", "=v"() + %v4 = call <32 x i32> asm sideeffect "; def $0", "=v"() + %v5 = call <32 x i32> asm sideeffect "; def $0", "=v"() + %v6 = call <32 x i32> asm sideeffect "; def $0", "=v"() + %v7 = call <32 x i32> asm sideeffect "; def $0", "=v"() + call void @use256vgprs() + call void asm sideeffect "; use $0", "v"(<32 x i32> %v0) + call void asm sideeffect "; use $0", "v"(<32 x i32> %v1) + call void asm sideeffect "; use $0", "v"(<32 x i32> %v2) + call void asm sideeffect "; use $0", "v"(<32 x i32> %v3) + call void asm sideeffect "; use $0", "v"(<32 x i32> %v4) + call void asm sideeffect "; use $0", "v"(<32 x i32> %v5) + call void asm sideeffect "; use $0", "v"(<32 x i32> %v6) + call void asm sideeffect "; use $0", "v"(<32 x i32> %v7) + ret void +} + +define void @foo() #0 { + ret void +} + +attributes #0 = { noinline } + +; GCN-LABEL: {{^}}k256_w8: +; GFX90A: NumVgprs: 32 +; GFX90A: NumAgprs: 32 +; GFX90A: TotalNumVgprs: 64 +define amdgpu_kernel void @k256_w8() #2568 { + call void @foo() 
+ call void @use256vgprs() + ret void +} + +; GCN-LABEL: {{^}}k256_w8_no_agprs: +; GFX90A: NumVgprs: 64 +; GFX90A: NumAgprs: 0 +; GFX90A: TotalNumVgprs: 64 +define amdgpu_kernel void @k256_w8_no_agprs() #2568 { + call void @use256vgprs() + ret void +} + +attributes #2568 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="8" } + +; GCN-LABEL: {{^}}k256_w4: +; GFX90A: NumVgprs: 64 +; GFX90A: NumAgprs: 64 +; GFX90A: TotalNumVgprs: 128 +define amdgpu_kernel void @k256_w4() #2564 { + call void @foo() + call void @use256vgprs() + ret void +} + +; GCN-LABEL: {{^}}k256_w4_no_agprs: +; GFX90A: NumVgprs: 128 +; GFX90A: NumAgprs: 0 +; GFX90A: TotalNumVgprs: 128 +define amdgpu_kernel void @k256_w4_no_agprs() #2564 { + call void @use256vgprs() + ret void +} + +attributes #2564 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="4" } + +; GCN-LABEL: {{^}}k256_w2: +; GFX90A: NumVgprs: 128 +; GFX90A: NumAgprs: 128 +; GFX90A: TotalNumVgprs: 256 +define amdgpu_kernel void @k256_w2() #2562 { + call void @foo() + call void @use256vgprs() + ret void +} + +; GCN-LABEL: {{^}}k256_w2_no_agprs: ; GFX90A: NumVgprs: 256 ; GFX90A: NumAgprs: 0 ; GFX90A: TotalNumVgprs: 256 -; GFX10WGP-WAVE32: NumVgprs: 256 -; GFX10WGP-WAVE64: NumVgprs: 256 -; GFX10CU-WAVE32: NumVgprs: 256 -; GFX10CU-WAVE64: NumVgprs: 256 -define amdgpu_kernel void @f256() #256 { +define amdgpu_kernel void @k256_w2_no_agprs() #2562 { call void @use256vgprs() ret void } -attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" } -; GCN-LABEL: {{^}}f512: -; GFX9: NumVgprs: 128 +attributes #2562 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2" } + +; GCN-LABEL: {{^}}k256_w1: +; GFX90A: NumVgprs: 256 +; GFX90A: NumAgprs: 256 +; GFX90A: TotalNumVgprs: 512 +define amdgpu_kernel void @k256_w1() #2561 { + call void @foo() + call void @use512vgprs() + ret void +} + +; GCN-LABEL: {{^}}k256_w1_no_agprs: +; GFX90A: NumVgprs: 256 +; GFX90A: NumAgprs: 
256 +; GFX90A: TotalNumVgprs: 512 +define amdgpu_kernel void @k256_w1_no_agprs() #2561 { + call void @use512vgprs() + ret void +} + +attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="1" } + +; GCN-LABEL: {{^}}k512_no_agprs: +; GFX90A: NumVgprs: 256 +; GFX90A: NumAgprs: 0 +; GFX90A: TotalNumVgprs: 256 +define amdgpu_kernel void @k512_no_agprs() #512 { + call void @use256vgprs() + ret void +} + +; GCN-LABEL: {{^}}k512_call: +; GFX90A: NumVgprs: 128 +; GFX90A: NumAgprs: 128 +; GFX90A: TotalNumVgprs: 256 +define amdgpu_kernel void @k512_call() #512 { + call void @foo() + call void @use256vgprs() + ret void +} + +; GCN-LABEL: {{^}}k512_virtual_agpr: ; GFX90A: NumVgprs: 128 ; GFX90A: NumAgprs: 128 ; GFX90A: TotalNumVgprs: 256 -; GFX10WGP-WAVE32: NumVgprs: 256 -; GFX10WGP-WAVE64: NumVgprs: 256 -; GFX10CU-WAVE32: NumVgprs: 128 -; GFX10CU-WAVE64: NumVgprs: 128 -define amdgpu_kernel void @f512() #512 { +define amdgpu_kernel void @k512_virtual_agpr() #512 { + %a0 = call i32 asm sideeffect "; def $0", "=a"() call void @use256vgprs() ret void } + +; GCN-LABEL: {{^}}k512_physical_agpr: +; GFX90A: NumVgprs: 128 +; GFX90A: NumAgprs: 128 +; GFX90A: TotalNumVgprs: 256 +define amdgpu_kernel void @k512_physical_agpr() #512 { + call void asm sideeffect "", "~{a8}" () + call void @use256vgprs() + ret void +} + +; GCN-LABEL: {{^}}f512: +; GFX90A: NumVgprs: 12{{[0-9]}} +; GFX90A: NumAgprs: {{[1-9]}} +define void @f512() #512 { + call void @use256vgprs() + ret void +} + attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" } -; GCN-LABEL: {{^}}f1024: -; GFX9: NumVgprs: 64 +; GCN-LABEL: {{^}}k1024: +; GFX90A: NumVgprs: 128 +; GFX90A: NumAgprs: 0 +; GFX90A: TotalNumVgprs: 128 +define amdgpu_kernel void @k1024() #1024 { + call void @use256vgprs() + ret void +} + +; GCN-LABEL: {{^}}k1024_call: ; GFX90A: NumVgprs: 64 ; GFX90A: NumAgprs: 64 ; GFX90A: TotalNumVgprs: 128 -; GFX10WGP-WAVE32: NumVgprs: 128 -; GFX10WGP-WAVE64: NumVgprs: 128 -; 
GFX10CU-WAVE32: NumVgprs: 64 -; GFX10CU-WAVE64: NumVgprs: 64 -define amdgpu_kernel void @f1024() #1024 { +define amdgpu_kernel void @k1024_call() #1024 { + call void @foo() call void @use256vgprs() ret void } + attributes #1024 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }