Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -349,7 +349,7 @@ def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "SGPR%u_LO16", 0, 105))> { - let AllocationPriority = 9; + let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; let HasSGPR = 1; @@ -368,7 +368,7 @@ (add (sequence "SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. - let AllocationPriority = 9; + let AllocationPriority = 0; let GeneratePressureSet = 0; let HasSGPR = 1; } @@ -528,14 +528,14 @@ let HasVGPR = 1 in { def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (sequence "VGPR%u_LO16", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; } def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (sequence "VGPR%u_HI16", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; } @@ -544,7 +544,7 @@ // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 32; let Weight = 1; } @@ -588,7 +588,7 @@ // AccVGPR 32-bit registers def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "AGPR%u", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 32; let Weight = 1; } @@ -653,7 +653,7 @@ SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { - let AllocationPriority = 10; + let AllocationPriority = 0; } def 
SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, @@ -663,42 +663,42 @@ SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } } // End GeneratePressureSet = 0 // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { - let AllocationPriority = 10; + let AllocationPriority = 0; let HasSGPR = 1; } @@ -712,7 +712,7 @@ def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; - let AllocationPriority = 11; + let AllocationPriority = 1; let HasSGPR = 1; } @@ -725,14 +725,14 @@ def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, 
v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, TTMP_64, TBA, TMA)> { let CopyCost = 1; - let AllocationPriority = 13; + let AllocationPriority = 1; let HasSGPR = 1; } def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; - let AllocationPriority = 13; + let AllocationPriority = 1; let HasSGPR = 1; } @@ -750,7 +750,7 @@ let HasSGPR = 1; } -multiclass SRegClass<int numRegs, int priority, +multiclass SRegClass<int numRegs, list<ValueType> regTypes, SIRegisterTuples regList, SIRegisterTuples ttmpList = regList, @@ -760,7 +760,7 @@ defvar sgprName = !strconcat("SGPR_", suffix); defvar ttmpName = !strconcat("TTMP_", suffix); - let AllocationPriority = priority, CopyCost = copyCost, HasSGPR = 1 in { + let AllocationPriority = !sub(numRegs, 1), CopyCost = copyCost, HasSGPR = 1 in { def "" # sgprName : SIRegisterClass<"AMDGPU", regTypes, 32, (add regList)> { } @@ -781,14 +781,14 @@ } } -defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; -defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; -defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; -defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; -defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; -defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; +defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; +defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; +defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; +defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; +defm "" : 
SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; +defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, LDS_DIRECT_CLASS)> { @@ -803,7 +803,7 @@ // Requires n v_mov_b32 to copy let CopyCost = numRegs; - let AllocationPriority = numRegs; + let AllocationPriority = !sub(numRegs, 1); let Weight = numRegs; } Index: llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -524,30 +524,30 @@ ; GFX908-NEXT: global_load_ushort v24, v[0:1], off glc ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX908-NEXT: s_load_dword s8, s[4:5], 0x18 -; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX908-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX908-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX908-NEXT: s_sub_i32 s4, 0, s1 -; GFX908-NEXT: s_lshr_b32 s11, s8, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s8 +; GFX908-NEXT: s_lshr_b32 s5, s6, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s6 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX908-NEXT: s_lshl_b64 s[8:9], s[2:3], 5 -; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s11 -; GFX908-NEXT: s_or_b32 s8, s8, 28 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5 +; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s5 +; GFX908-NEXT: s_or_b32 s10, s10, 28 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v7, s3 -; GFX908-NEXT: s_mov_b32 s10, 0 +; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], 5 ; GFX908-NEXT: 
v_mov_b32_e32 v6, s2 ; GFX908-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX908-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 +; GFX908-NEXT: s_mov_b32 s4, 0 ; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX908-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, s9 +; GFX908-NEXT: v_mov_b32_e32 v2, s10 +; GFX908-NEXT: v_mov_b32_e32 v3, s11 ; GFX908-NEXT: v_mul_lo_u32 v4, v0, s1 ; GFX908-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX908-NEXT: v_sub_u32_e32 v4, s0, v4 @@ -562,11 +562,11 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s0, v24 ; GFX908-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX908-NEXT: s_mul_i32 s1, s7, s0 -; GFX908-NEXT: s_mul_hi_u32 s7, s6, s0 -; GFX908-NEXT: s_mul_i32 s0, s6, s0 -; GFX908-NEXT: s_add_i32 s1, s7, s1 -; GFX908-NEXT: s_lshl_b64 s[6:7], s[0:1], 5 +; GFX908-NEXT: s_mul_i32 s1, s9, s0 +; GFX908-NEXT: s_mul_hi_u32 s5, s8, s0 +; GFX908-NEXT: s_mul_i32 s0, s8, s0 +; GFX908-NEXT: s_add_i32 s1, s5, s1 +; GFX908-NEXT: s_lshl_b64 s[8:9], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 @@ -583,13 +583,13 @@ ; GFX908-NEXT: v_mov_b32_e32 v8, 0 ; GFX908-NEXT: v_mov_b32_e32 v9, 0 ; GFX908-NEXT: global_load_dwordx2 v[8:9], v[8:9], off -; GFX908-NEXT: s_mov_b32 s11, s10 -; GFX908-NEXT: v_mov_b32_e32 v13, s11 -; GFX908-NEXT: v_mov_b32_e32 v15, s11 -; GFX908-NEXT: v_mov_b32_e32 v17, s11 -; GFX908-NEXT: v_mov_b32_e32 v12, s10 -; GFX908-NEXT: v_mov_b32_e32 v14, s10 -; GFX908-NEXT: v_mov_b32_e32 v16, s10 +; GFX908-NEXT: s_mov_b32 s5, s4 +; GFX908-NEXT: v_mov_b32_e32 v13, s5 +; GFX908-NEXT: v_mov_b32_e32 v15, s5 +; GFX908-NEXT: v_mov_b32_e32 v17, s5 +; GFX908-NEXT: v_mov_b32_e32 v12, s4 +; GFX908-NEXT: v_mov_b32_e32 v14, s4 +; GFX908-NEXT: v_mov_b32_e32 v16, s4 ; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v11, v3 ; GFX908-NEXT: v_mov_b32_e32 v19, v13 @@ -600,27 +600,27 @@ ; 
GFX908-NEXT: v_readfirstlane_b32 s3, v9 ; GFX908-NEXT: s_add_u32 s2, s2, 1 ; GFX908-NEXT: s_addc_u32 s3, s3, 0 -; GFX908-NEXT: s_mul_hi_u32 s9, s4, s2 -; GFX908-NEXT: s_mul_i32 s11, s5, s2 -; GFX908-NEXT: s_mul_i32 s8, s4, s2 -; GFX908-NEXT: s_mul_i32 s2, s4, s3 -; GFX908-NEXT: s_add_i32 s2, s9, s2 -; GFX908-NEXT: s_add_i32 s9, s2, s11 +; GFX908-NEXT: s_mul_hi_u32 s5, s6, s2 +; GFX908-NEXT: s_mul_i32 s11, s7, s2 +; GFX908-NEXT: s_mul_i32 s10, s6, s2 +; GFX908-NEXT: s_mul_i32 s2, s6, s3 +; GFX908-NEXT: s_add_i32 s2, s5, s2 +; GFX908-NEXT: s_add_i32 s5, s2, s11 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v8, vcc, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc ; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v20, s7 -; GFX908-NEXT: v_add_co_u32_e64 v10, s[2:3], s6, v10 +; GFX908-NEXT: v_mov_b32_e32 v20, s9 +; GFX908-NEXT: v_add_co_u32_e64 v10, s[2:3], s8, v10 ; GFX908-NEXT: v_addc_co_u32_e64 v11, s[2:3], v11, v20, s[2:3] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_mov_b32_e32 v21, s9 -; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, s8, v10 +; GFX908-NEXT: v_mov_b32_e32 v21, s5 +; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, s10, v10 ; GFX908-NEXT: v_addc_co_u32_e32 v21, vcc, v11, v21, vcc ; GFX908-NEXT: global_load_dword v28, v[20:21], off offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -664,25 +664,25 @@ ; GFX90A-NEXT: global_load_ushort v28, v[0:1], off glc ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x10 -; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x18 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10 +; GFX90A-NEXT: 
s_load_dword s3, s[4:5], 0x18 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: s_mov_b32 s8, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX90A-NEXT: s_sub_i32 s9, 0, s7 -; GFX90A-NEXT: s_lshl_b64 s[4:5], s[10:11], 5 -; GFX90A-NEXT: s_or_b32 s4, s4, 28 +; GFX90A-NEXT: s_sub_i32 s12, 0, s7 +; GFX90A-NEXT: s_lshr_b32 s13, s3, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s3 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: s_lshr_b32 s12, s2, 16 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s2 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s13 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 +; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s12 -; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[10:11], s[10:11] op_sel:[0,1] -; GFX90A-NEXT: v_mul_lo_u32 v8, s9, v0 +; GFX90A-NEXT: s_mov_b32 s2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_mul_lo_u32 v8, s12, v0 ; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v8 ; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 @@ -699,13 +699,13 @@ ; GFX90A-NEXT: v_lshlrev_b64 v[8:9], 5, v[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], 0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s4, v28 -; GFX90A-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX90A-NEXT: s_mul_i32 s1, s1, s4 -; GFX90A-NEXT: s_mul_hi_u32 s5, s0, s4 -; GFX90A-NEXT: s_mul_i32 s0, s0, s4 -; GFX90A-NEXT: s_add_i32 s1, s5, s1 -; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5 +; GFX90A-NEXT: v_readfirstlane_b32 s3, v28 +; GFX90A-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX90A-NEXT: s_mul_i32 s1, s1, s3 +; GFX90A-NEXT: s_mul_hi_u32 s6, s0, s3 +; GFX90A-NEXT: s_mul_i32 s0, s0, 
s3 +; GFX90A-NEXT: s_add_i32 s1, s6, s1 +; GFX90A-NEXT: s_lshl_b64 s[6:7], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 @@ -720,39 +720,39 @@ ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off -; GFX90A-NEXT: s_mov_b32 s9, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: s_mov_b32 s3, s2 +; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s6, v12 -; GFX90A-NEXT: v_readfirstlane_b32 s7, v13 -; GFX90A-NEXT: s_add_u32 s6, s6, 1 -; GFX90A-NEXT: s_addc_u32 s7, s7, 0 -; GFX90A-NEXT: s_mul_hi_u32 s9, s2, s6 -; GFX90A-NEXT: s_mul_i32 s7, s2, s7 -; GFX90A-NEXT: s_mul_i32 s10, s3, s6 -; GFX90A-NEXT: s_add_i32 s7, s9, s7 -; GFX90A-NEXT: s_mul_i32 s6, s2, s6 -; GFX90A-NEXT: s_add_i32 s7, s7, s10 +; GFX90A-NEXT: v_readfirstlane_b32 s3, v12 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v13 +; GFX90A-NEXT: s_add_u32 s3, s3, 1 +; GFX90A-NEXT: s_addc_u32 s9, s8, 0 +; GFX90A-NEXT: s_mul_hi_u32 s10, s4, s3 +; GFX90A-NEXT: s_mul_i32 s11, s5, s3 +; GFX90A-NEXT: s_mul_i32 s8, s4, s3 +; GFX90A-NEXT: s_mul_i32 s3, s4, s9 +; GFX90A-NEXT: s_add_i32 s3, s10, s3 +; GFX90A-NEXT: s_add_i32 s3, s3, s11 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX90A-NEXT: v_mov_b32_e32 v24, s5 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v24, s7 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s6, v14 ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v24, vcc ; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_mov_b32_e32 v25, s7 -; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s6, v14 +; GFX90A-NEXT: v_mov_b32_e32 v25, s3 +; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s8, v14 ; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v25, vcc ; GFX90A-NEXT: global_load_dword v30, v[24:25], off offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -28,11 +28,11 @@ ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 ; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 ; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 -; GFX9-DAG: s_mov_b32 s6, s1 -; GFX9-DAG: s_mov_b32 s7, 0 -; GFX9-DAG: s_mov_b32 s1, s7 +; GFX9-DAG: s_mov_b32 s2, s1 +; GFX9-DAG: s_mov_b32 s3, 0 +; GFX9-DAG: s_mov_b32 s1, s3 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(6)* %p1, i32 2 %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0 @@ -125,11 +125,11 @@ ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 ; VI-DAG: s_load_dwordx2 s[{{.*}}], 
s[0:1], 0x0 ; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 -; GFX9-DAG: s_mov_b32 s6, s1 -; GFX9-DAG: s_mov_b32 s7, 0 -; GFX9-DAG: s_mov_b32 s1, s7 +; GFX9-DAG: s_mov_b32 s2, s1 +; GFX9-DAG: s_mov_b32 s3, 0 +; GFX9-DAG: s_mov_b32 s1, s3 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10 +; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 { %gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(6)* %p1, i32 2 %r0 = load <2 x float>, <2 x float> addrspace(6)* %p0 Index: llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir +++ llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir @@ -26,6 +26,7 @@ ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead %3:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit [[DEF1]] ; CHECK-NEXT: S_NOP 0, implicit [[DEF1]] ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF @@ -34,16 +35,16 @@ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %5.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { - ; CHECK-NEXT: internal %5.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 + ; CHECK-NEXT: undef %6.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY 
[[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { + ; CHECK-NEXT: internal %6.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 ; CHECK-NEXT: } - ; CHECK-NEXT: %5.sub0:av_1024_align2 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit %5.sub0 + ; CHECK-NEXT: %6.sub0:av_1024_align2 = IMPLICIT_DEF + ; CHECK-NEXT: S_NOP 0, implicit %6.sub0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit %5 + ; CHECK-NEXT: S_NOP 0, implicit %6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) @@ -52,14 +53,15 @@ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: undef %3.sub0:vreg_1024_align2 = COPY [[DEF]] - ; CHECK-NEXT: S_NOP 0, implicit %3 + ; CHECK-NEXT: undef %4.sub0:vreg_1024_align2 = COPY [[DEF]] + ; CHECK-NEXT: S_NOP 0, implicit %4 bb.0: %0:vgpr_32 = IMPLICIT_DEF %1:vreg_1024_align2 = IMPLICIT_DEF %2:vreg_1024_align2 = COPY %1 bb.1: + %5:vreg_64 = IMPLICIT_DEF S_NOP 0, implicit %1 S_NOP 0, implicit %1 %1:vreg_1024_align2 = IMPLICIT_DEF Index: llvm/test/CodeGen/AMDGPU/indirect-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -1144,13 +1144,13 @@ ; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[10:11] +; GCN-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NEXT: 
v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] +; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] ; GCN-NEXT: s_cbranch_execnz .LBB6_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -1337,14 +1337,14 @@ ; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s9, v2 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: v_mov_b32_e32 v0, v41 -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN-NEXT: s_xor_b64 exec, exec, s[6:7] +; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execnz .LBB7_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1539,15 +1539,15 @@ ; GCN-NEXT: v_writelane_b32 v40, s63, 31 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s9, v2 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_xor_b64 exec, exec, s[6:7] +; GCN-NEXT: s_xor_b64 exec, exec, 
s[8:9] ; GCN-NEXT: s_cbranch_execnz .LBB8_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -1736,13 +1736,13 @@ ; GCN-NEXT: v_writelane_b32 v40, s63, 31 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[6:7] +; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execnz .LBB9_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -1019,10 +1019,10 @@ ; GFX10-32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-32-NEXT: s_mov_b32 s2, 0 +; GFX10-32-NEXT: s_mov_b32 s1, 0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo @@ -1031,30 +1031,30 @@ ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_mov_b32 s3, 0 +; GFX10-32-NEXT: s_or_b32 exec_lo, 
exec_lo, s2 +; GFX10-32-NEXT: s_mov_b32 s2, 0 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_add_i32 s3, s3, 1 -; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s3, v1 -; GFX10-32-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_add_i32 s2, s2, 1 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 +; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_8 ; GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s1, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s3, 0, s1 -; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 +; GFX10-32-NEXT: s_mov_b32 s3, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 +; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 ; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s4, s1 -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s4 +; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 +; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1066,7 +1066,7 @@ ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 ; GFX10-32-NEXT: s_branch .LBB7_4 ; GFX10-32-NEXT: .LBB7_8: ; %.return -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; 
GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 Index: llvm/test/CodeGen/AMDGPU/spill-vgpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-vgpr.ll +++ llvm/test/CodeGen/AMDGPU/spill-vgpr.ll @@ -135,13 +135,13 @@ ; GFX908-DAG: v_accvgpr_read_b32 ; GFX900: NumVgprs: 256 -; GFX908: NumVgprs: 252 -; GFX900: ScratchSize: 2052 +; GFX908: NumVgprs: 254 +; GFX900: ScratchSize: 1796 ; GFX908: ScratchSize: 0 ; GFX900: VGPRBlocks: 63 -; GFX908: VGPRBlocks: 62 +; GFX908: VGPRBlocks: 63 ; GFX900: NumVGPRsForWavesPerEU: 256 -; GFX908: NumVGPRsForWavesPerEU: 252 +; GFX908: NumVGPRsForWavesPerEU: 254 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid Index: llvm/test/CodeGen/AMDGPU/srem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srem64.ll +++ llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1955,19 +1955,19 @@ ; GCN-LABEL: s_test_srem24_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s6, 0x41c00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[4:5], s[2:3], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-NEXT: s_ashr_i32 s5, s4, 30 -; GCN-NEXT: s_or_b32 s5, s5, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_mov_b32 s5, 0x41c00000 +; GCN-NEXT: s_ashr_i32 s6, s4, 30 +; GCN-NEXT: s_or_b32 s6, s6, 1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v2, -v1, v0, s6 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s5 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| ; 
GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc @@ -1982,19 +1982,19 @@ ; GCN-IR-LABEL: s_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s6, 0x41c00000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[2:3], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-IR-NEXT: s_ashr_i32 s5, s4, 30 -; GCN-IR-NEXT: s_or_b32 s5, s5, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: s_mov_b32 s5, 0x41c00000 +; GCN-IR-NEXT: s_ashr_i32 s6, s4, 30 +; GCN-IR-NEXT: s_or_b32 s6, s6, 1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s6 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s6 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s5 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc Index: llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -217,10 +217,10 @@ ; GCN-NEXT: s_mov_b32 s45, s14 ; GCN-NEXT: s_mov_b32 s46, s13 ; GCN-NEXT: s_mov_b32 s47, s12 -; GCN-NEXT: s_mov_b64 s[36:37], s[10:11] -; GCN-NEXT: s_mov_b64 s[38:39], s[8:9] -; GCN-NEXT: s_mov_b64 s[40:41], s[6:7] -; GCN-NEXT: s_mov_b64 s[42:43], s[4:5] +; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] +; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] +; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] +; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -233,7 +233,7 @@ ; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12 ; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 
s[34:35], 0, v44 +; GCN-NEXT: v_cmp_eq_f32_e64 s[42:43], 0, v44 ; GCN-NEXT: s_branch .LBB1_3 ; GCN-NEXT: .LBB1_1: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 @@ -275,10 +275,10 @@ ; GCN-NEXT: ; %bb.7: ; %bb11 ; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b64 s[4:5], s[42:43] -; GCN-NEXT: s_mov_b64 s[6:7], s[40:41] -; GCN-NEXT: s_mov_b64 s[8:9], s[38:39] -; GCN-NEXT: s_mov_b64 s[10:11], s[36:37] +; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] +; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] +; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] ; GCN-NEXT: s_mov_b32 s12, s47 ; GCN-NEXT: s_mov_b32 s13, s46 ; GCN-NEXT: s_mov_b32 s14, s45 @@ -293,7 +293,7 @@ ; GCN-NEXT: ; %bb.8: ; %bb14 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[34:35] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[42:43] ; GCN-NEXT: s_cbranch_execnz .LBB1_10 ; GCN-NEXT: ; %bb.9: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 Index: llvm/test/CodeGen/AMDGPU/urem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/urem64.ll +++ llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1416,23 +1416,23 @@ ; GCN-LABEL: s_test_urem24_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: s_movk_i32 s3, 0x5b7f +; GCN-NEXT: s_mov_b32 s3, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GCN-NEXT: v_mad_f32 v0, -v1, s4, v0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; 
GCN-NEXT: v_mul_lo_u32 v0, v0, s3 +; GCN-NEXT: v_mad_f32 v0, -v1, s3, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1441,23 +1441,23 @@ ; GCN-IR-LABEL: s_test_urem24_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-IR-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f +; GCN-IR-NEXT: s_mov_b32 s3, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s3, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0