diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -457,7 +457,10 @@ } assert(ScratchWaveOffsetReg); - if (MF.getFrameInfo().hasCalls()) { + // We need the SP if the function has calls, but we may also need it if the + // function has non spill stack objects (in some cases). So if they exist we + // initialize SP in case it is required. + if (MF.getFrameInfo().hasCalls() || MFI->hasNonSpillStackObjects()) { Register SPReg = MFI->getStackPtrOffsetReg(); assert(SPReg != AMDGPU::SP_REG); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -15,6 +15,7 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s4, s32, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -26,6 +27,7 @@ ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s6, s6, s9 +; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: s_mov_b32 s33, 0 ; GFX10-NEXT: s_addc_u32 s7, s7, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 @@ -117,6 +119,7 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s4, s32, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -128,6 +131,7 @@ ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s6, s6, s9 +; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: s_mov_b32 s33, 0 ; GFX10-NEXT: 
s_addc_u32 s7, s7, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 @@ -219,6 +223,7 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s4, s32, s4 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800 @@ -231,6 +236,7 @@ ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s6, s6, s9 +; GFX10-NEXT: s_movk_i32 s32, 0x400 ; GFX10-NEXT: s_mov_b32 s33, 0 ; GFX10-NEXT: s_addc_u32 s7, s7, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -18,12 +18,13 @@ ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b32 s33, 0 +; GCN-NEXT: s_movk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cselect_b32 s6, 1, 0 ; GCN-NEXT: s_and_b32 s6, s6, 1 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dword s6, s[4:5], 0xc @@ -99,12 +100,13 @@ ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b32 s33, 0 +; GCN-NEXT: s_movk_i32 s32, 0x1000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cselect_b32 s6, 1, 0 ; GCN-NEXT: s_and_b32 s6, s6, 1 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- 
a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -27,6 +27,7 @@ ; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_movk_i32 s32, 0x200 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX803-NEXT: s_endpgm @@ -38,12 +39,14 @@ ; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_movk_i32 s32, 0x200 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_movk_i32 s32, 0x100 ; GFX1010-NEXT: s_addc_u32 s5, s5, 0 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 @@ -191,6 +194,7 @@ ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_movk_i32 s32, 0x200 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX803-NEXT: s_endpgm @@ -203,12 +207,14 @@ ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_movk_i32 s32, 0x200 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_force_fp_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_movk_i32 s32, 0x100 ; GFX1010-NEXT: s_mov_b32 s33, 0 ; GFX1010-NEXT: s_addc_u32 s5, s5, 0 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 @@ -344,6 +350,7 @@ ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; GFX803-NEXT: s_mov_b32 s4, 0x40000 +; GFX803-NEXT: s_mov_b32 s32, 0x40100 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; 
GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill @@ -363,6 +370,7 @@ ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; GFX900-NEXT: s_mov_b32 s6, 0x40000 +; GFX900-NEXT: s_mov_b32 s32, 0x40100 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: ;;#ASMSTART @@ -376,6 +384,7 @@ ; GFX1010-LABEL: test_sgpr_offset_kernel: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_mov_b32 s32, 0x20080 ; GFX1010-NEXT: s_addc_u32 s5, s5, 0 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -198,6 +198,7 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_movk_i32 s32, 0x280 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -13,6 +13,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -58,6 +59,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -105,6 +107,7 @@ ; 
GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -152,6 +155,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -170,6 +174,7 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -260,6 +265,7 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -321,6 +327,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -372,6 +379,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -423,6 +431,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -473,6 +482,7 @@ ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -595,6 +605,7 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_movk_i32 s8, 0xff +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -657,6 +668,7 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -711,6 +723,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s2, 0xff +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -765,6 +778,7 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 @@ -817,6 +831,7 @@ ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -939,6 +954,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: 
s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -986,6 +1002,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -1035,6 +1052,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -1084,6 +1102,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -1134,6 +1153,7 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1249,6 +1269,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -1294,6 +1315,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -1341,6 +1363,7 @@ ; 
GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -1388,6 +1411,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1406,6 +1430,7 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1464,6 +1489,7 @@ ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_i32 s15, s6, 0x40018 ; GFX7-NEXT: s_bfe_i32 s16, s6, 0x40014 @@ -1517,6 +1543,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1565,6 +1592,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -1632,6 +1660,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 
0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -1698,6 +1727,7 @@ ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -1810,6 +1840,7 @@ ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 ; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40000 @@ -1883,6 +1914,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1954,6 +1986,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2042,6 +2075,7 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -2128,6 +2162,7 @@ ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll 
b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -13,6 +13,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -58,6 +59,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -105,6 +107,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -152,6 +155,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -170,6 +174,7 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -259,6 +264,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -304,6 +310,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -352,6 +359,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -400,6 +408,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -447,6 +456,7 @@ ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -562,6 +572,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -607,6 +618,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -655,6 +667,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -703,6 +716,7 @@ ; GFX9-DL: ; %bb.0: ; 
%entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -750,6 +764,7 @@ ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -865,6 +880,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -911,6 +927,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -962,6 +979,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1013,6 +1031,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1063,6 +1082,7 @@ ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 
v0, s2 @@ -1165,6 +1185,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -1211,6 +1232,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1262,6 +1284,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1313,6 +1336,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1363,6 +1387,7 @@ ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -1463,6 +1488,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -1510,6 +1536,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -1559,6 +1586,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -1608,6 +1636,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -1658,6 +1687,7 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1772,6 +1802,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -1817,6 +1848,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -1864,6 +1896,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -1911,6 +1944,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1929,6 +1963,7 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1983,6 +2018,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -2038,6 +2074,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2086,6 +2123,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -2141,6 +2179,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -2195,6 +2234,7 @@ ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; 
GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -2285,6 +2325,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -2356,6 +2397,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2427,6 +2469,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2493,6 +2536,7 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -2557,6 +2601,7 @@ ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -2658,6 +2703,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s32, 0x300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -2704,6 +2750,7 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 
+; GFX8-NEXT: s_movk_i32 s32, 0x300 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2755,6 +2802,7 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s32, 0x300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2806,6 +2854,7 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s32, 0x300 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -2856,6 +2905,7 @@ ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s32, 0x180 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -18,6 +18,7 @@ ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_movk_i32 s32, 0x400 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s8, 0 @@ -89,6 +90,7 @@ ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_movk_i32 s32, 0x1000 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -10,7 +10,8 @@ ; 
VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, 9 +; VI-NEXT: v_mov_b32_e32 v0, 9 +; VI-NEXT: s_movk_i32 s32, 0x4000 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; VI-NEXT: s_endpgm @@ -33,7 +34,7 @@ ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 8 +; VI-NEXT: .amdhsa_next_free_sgpr 33 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 ; VI-NEXT: .amdhsa_float_round_mode_16_64 0 @@ -58,6 +59,7 @@ ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: s_movk_i32 s32, 0x4000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc @@ -79,7 +81,7 @@ ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 8 +; GFX9-NEXT: .amdhsa_next_free_sgpr 33 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_float_round_mode_32 0 ; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0 @@ -110,6 +112,7 @@ ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 +; VI-NEXT: s_movk_i32 s32, 0x200 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_endpgm @@ -132,7 +135,7 @@ ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 8 +; VI-NEXT: .amdhsa_next_free_sgpr 33 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 ; VI-NEXT: .amdhsa_float_round_mode_16_64 0 @@ -157,6 +160,7 @@ ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: s_movk_i32 s32, 0x200 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc @@ -178,7 +182,7 @@ ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 8 +; GFX9-NEXT: .amdhsa_next_free_sgpr 33 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_float_round_mode_32 0 ; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0 @@ -209,6 +213,7 @@ ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 +; VI-NEXT: s_movk_i32 s32, 0x2000 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_endpgm @@ -231,7 +236,7 @@ ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 8 +; VI-NEXT: .amdhsa_next_free_sgpr 33 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 ; VI-NEXT: .amdhsa_float_round_mode_16_64 0 @@ -256,6 +261,7 @@ ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc @@ -277,7 +283,7 @@ ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 8 +; GFX9-NEXT: .amdhsa_next_free_sgpr 33 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_float_round_mode_32 0 ; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0