Index: llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -251,9 +251,19 @@
   RPT.advanceToNext();
   GCNRegPressure MaxPressure = RPT.moveMaxPressure();
   unsigned Occupancy = MaxPressure.getOccupancy(*ST);
+
+  // Don't push over half the register budget. We don't want to introduce
+  // spilling just to form a soft clause.
+  //
+  // FIXME: This pressure check is fundamentally broken. First, this is checking
+  // the global pressure, not the pressure at this specific point in the
+  // program. Second, it's not accounting for the increased liveness of the use
+  // operands due to the early clobber we will introduce. Third, the pressure
+  // tracking does not account for the alignment requirements for SGPRs, or the
+  // fragmentation of registers the allocator will need to satisfy.
   if (Occupancy >= MFI->getMinAllowedOccupancy() &&
-      MaxPressure.getVGPRNum() <= MaxVGPRs &&
-      MaxPressure.getSGPRNum() <= MaxSGPRs) {
+      MaxPressure.getVGPRNum() <= MaxVGPRs / 2 &&
+      MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
     LastRecordedOccupancy = Occupancy;
     return true;
   }
Index: llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -2697,20 +2697,20 @@
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
-; GFX9-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
 ; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12
@@ -2780,28 +2780,28 @@
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
-; GFX9-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
-; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    global_load_dword v33, v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
 ; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    global_load_dword v32, v[0:1], off
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
Index: llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir
@@ -0,0 +1,165 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack -run-pass=si-form-memory-clauses -verify-machineinstrs -o - %s | FileCheck %s
+
+# This previously would produce a bundle that could not be satisfied
+# due to using nearly the entire register budget and not considering
+# the alignment requirement of large SGPR tuples.
+ +--- +name: soft_clause_bundle_out_of_registers +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + occupancy: 6 +body: | + ; CHECK-LABEL: name: soft_clause_bundle_out_of_registers + ; CHECK: early-clobber %4:sgpr_512, early-clobber %5:sgpr_512 = BUNDLE %3 { + ; CHECK: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 0, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4096, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK: } + ; CHECK: [[S_LOAD_DWORDX16_IMM2:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8192, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX16_IMM3:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 12288, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX16_IMM4:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 64, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX16_IMM5:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4160, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX16_IMM6:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8256, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK: dead $sgpr30_sgpr31 = SI_CALL undef $sgpr4_sgpr5, 0, csr_amdgpu_highregs, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr70, implicit-def $sgpr80, implicit-def $sgpr90, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6 + + %0:sgpr_64 = COPY $sgpr4_sgpr5 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 8, addrspace 4) + %2:vreg_64 = IMPLICIT_DEF + + bb.1: + undef %3.sub0:sreg_64 = S_ADD_U32 %1.sub0, 0, implicit-def $scc + %3.sub1:sreg_64 = S_ADDC_U32 %1.sub1, 0, implicit-def dead $scc, implicit killed $scc + %4:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 0, 0, 0 :: (load 64, align 4, addrspace 4) + %5:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4096, 0, 0 :: (load 64, align 4, addrspace 4) + %6:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8192, 0, 0 :: (load 64, align 4, addrspace 4) + %7:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 12288, 0, 0 :: (load 64, align 4, addrspace 4) + %8:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 64, 0, 0 :: (load 64, align 4, addrspace 4) + %9:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4160, 0, 0 :: (load 64, align 4, addrspace 4) + %10:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8256, 0, 0 :: (load 64, align 4, addrspace 4) + dead $sgpr30_sgpr31 = SI_CALL undef $sgpr4_sgpr5, 0, csr_amdgpu_highregs, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr70, implicit-def $sgpr80, implicit-def $sgpr90, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + dead %11:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub1, 0, 0, implicit $mode, implicit $exec + dead %12:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub2, 0, 0, implicit $mode, implicit $exec + dead %13:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub3, 0, 0, implicit $mode, implicit $exec + dead %14:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub4, 0, 0, implicit $mode, implicit $exec + dead %15:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub5, 0, 0, implicit $mode, implicit $exec + dead %16:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub6, 0, 0, implicit $mode, implicit $exec + dead %17:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub7, 0, 0, implicit $mode, implicit $exec + dead %18:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub8, 0, 0, implicit $mode, implicit $exec + dead %19:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub9, 0, 0, implicit $mode, implicit $exec + dead %20:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub10, 0, 0, implicit $mode, implicit $exec + dead %21:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub11, 0, 0, implicit $mode, implicit $exec + dead %22:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub12, 0, 0, implicit $mode, implicit $exec + dead %23:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub13, 0, 0, implicit $mode, implicit $exec + dead %24:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub14, 0, 0, implicit $mode, implicit $exec + dead %25:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %4.sub15, 0, 0, implicit $mode, implicit $exec + dead %26:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub0, 0, 0, implicit $mode, implicit $exec + dead %27:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub1, 0, 0, implicit $mode, implicit $exec + dead %28:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub2, 0, 0, implicit $mode, implicit $exec + dead %29:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub3, 0, 0, implicit $mode, implicit $exec + dead %30:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub4, 0, 0, implicit $mode, implicit $exec + dead %31:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub5, 0, 0, implicit $mode, implicit $exec + dead %32:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub6, 0, 0, implicit $mode, implicit $exec + dead %33:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub7, 0, 0, implicit $mode, implicit $exec + dead %34:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub8, 0, 0, implicit $mode, implicit $exec + dead %35:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub9, 0, 0, implicit $mode, implicit $exec + dead %36:vgpr_32 = nofpexcept 
V_ADD_F32_e64 0, 0, 0, %5.sub10, 0, 0, implicit $mode, implicit $exec
+    dead %37:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub11, 0, 0, implicit $mode, implicit $exec
+    dead %38:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub12, 0, 0, implicit $mode, implicit $exec
+    dead %39:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub13, 0, 0, implicit $mode, implicit $exec
+    dead %40:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub14, 0, 0, implicit $mode, implicit $exec
+    dead %41:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %5.sub15, 0, 0, implicit $mode, implicit $exec
+    dead %42:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub0, 0, 0, implicit $mode, implicit $exec
+    dead %43:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub1, 0, 0, implicit $mode, implicit $exec
+    dead %44:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub2, 0, 0, implicit $mode, implicit $exec
+    dead %45:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub3, 0, 0, implicit $mode, implicit $exec
+    dead %46:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub4, 0, 0, implicit $mode, implicit $exec
+    dead %47:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub5, 0, 0, implicit $mode, implicit $exec
+    dead %48:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub8, 0, 0, implicit $mode, implicit $exec
+    dead %49:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub9, 0, 0, implicit $mode, implicit $exec
+    dead %50:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub10, 0, 0, implicit $mode, implicit $exec
+    dead %51:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub11, 0, 0, implicit $mode, implicit $exec
+    dead %52:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub12, 0, 0, implicit $mode, implicit $exec
+    dead %53:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub14, 0, 0, implicit $mode, implicit $exec
+    dead %54:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %8.sub15, 0, 0, implicit $mode, implicit $exec
+    dead %55:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %9.sub10, 0, 0, implicit $mode, implicit $exec
+    dead %56:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %6.sub0, 0, 0, implicit $mode, implicit $exec
+    dead %57:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %7.sub10, 0, 0, implicit $mode, implicit $exec
+    dead %58:vgpr_32 = nofpexcept V_ADD_F32_e64 0, 0, 0, %10.sub0, 0, 0, implicit $mode, implicit $exec
+    S_CMP_LG_U32 0, 0, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+# Case with a simple clause which exceeds the pressure limit, though it
+# didn't hit the register allocator error.
+
+---
+name: simple_huge_reg_tuple_clause
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  waveLimiter: false
+  memoryBound: false
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy: 10
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6
+    ; CHECK-LABEL: name: simple_huge_reg_tuple_clause
+    ; CHECK: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
+    ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+    ; CHECK: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 2
+    ; CHECK: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 3
+    ; CHECK: [[S_MOV_B64_4:%[0-9]+]]:sreg_64 = S_MOV_B64 4
+    ; CHECK: [[S_MOV_B64_5:%[0-9]+]]:sreg_64 = S_MOV_B64 5
+    ; CHECK: [[S_MOV_B64_6:%[0-9]+]]:sreg_64 = S_MOV_B64 6
+    ; CHECK: early-clobber %9:sgpr_512, early-clobber %8:sgpr_512 = BUNDLE [[COPY]] {
+    ; CHECK: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0, 0 :: (load 64, align 4, addrspace 4)
+    ; CHECK: [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 64, 0, 0 :: (load 64, align 4, addrspace 4)
+    ; CHECK: }
+    ; CHECK: [[S_LOAD_DWORDX16_IMM2:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 4096, 0, 0 :: (load 64, align 4, addrspace 4)
+    ; CHECK: [[S_LOAD_DWORDX16_IMM3:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 4160, 0, 0 :: (load 64, align 4, addrspace 4)
+    ; CHECK: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM]]
+    ; CHECK: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM1]]
+    ; CHECK: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM2]]
+    ; CHECK: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM3]]
+    ; CHECK: S_NOP 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[S_MOV_B64_3]], implicit [[S_MOV_B64_4]]
+    ; CHECK: S_NOP 0, implicit [[S_MOV_B64_5]]
+    ; CHECK: S_NOP 0, implicit [[S_MOV_B64_6]]
+    ; CHECK: S_ENDPGM 0
+    %0:sreg_64 = COPY $sgpr4_sgpr5
+    %1:sreg_64 = S_MOV_B64 0
+    %2:sreg_64 = S_MOV_B64 1
+    %3:sreg_64 = S_MOV_B64 2
+    %4:sreg_64 = S_MOV_B64 3
+    %5:sreg_64 = S_MOV_B64 4
+    %6:sreg_64 = S_MOV_B64 5
+    %7:sreg_64 = S_MOV_B64 6
+    %8:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 0, 0, 0 :: (load 64, align 4, addrspace 4)
+    %9:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 64, 0, 0 :: (load 64, align 4, addrspace 4)
+    %10:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 4096, 0, 0 :: (load 64, align 4, addrspace 4)
+    %11:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 4160, 0, 0 :: (load 64, align 4, addrspace 4)
+    S_NOP 0, implicit %8
+    S_NOP 0, implicit %9
+    S_NOP 0, implicit %10
+    S_NOP 0, implicit %11
+    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
+    S_NOP 0, implicit %6
+    S_NOP 0, implicit %7
+    S_ENDPGM 0
+
+...
Index: llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
@@ -0,0 +1,638 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck %s
+
+; FIXME: The wide loads and bundles introduce so much spilling.
+define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr) { +; CHECK-LABEL: excess_soft_clause_reg_pressure: +; CHECK: BB0_1: ; %for.cond28.preheader +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK: global_load_dword +; CHECK-NEXT: global_load_dword +; CHECK-NEXT: global_load_dword +; CHECK-NEXT: global_load_dword + +; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: v_writelane_b32 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 + +; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +; CHECK-NEXT: v_readlane_b32 +entry: + %i = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %i1 = bitcast i8 addrspace(4)* %i to i64 addrspace(4)* + %i2 = load i64, i64 addrspace(4)* %i1, align 8 + %i3 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i4 = shl i32 %i3, 8 + %i5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5 + %i6 = add i32 %i4, %i5 + %i7 = trunc i64 %i2 to i32 + %conv = add i32 %i6, %i7 + %conv.frozen = freeze i32 %conv + %div = udiv i32 %conv.frozen, 49 + %add.ptr22 = getelementptr inbounds float, float addrspace(4)* %wei_ptr, i64 undef + br label %for.cond28.preheader + +for.cond28.preheader: ; preds = %for.cond28.preheader, %entry + %accum.sroa.110.0 = phi float [ 0.000000e+00, %entry ], [ %i251, %for.cond28.preheader ] + %accum.sroa.106.0 = phi float [ 0.000000e+00, %entry ], [ %i247, %for.cond28.preheader ] + %accum.sroa.102.0 = phi float [ 0.000000e+00, %entry ], [ %i243, %for.cond28.preheader ] + %accum.sroa.98.0 = phi float [ 0.000000e+00, 
%entry ], [ %i239, %for.cond28.preheader ] + %accum.sroa.94.0 = phi float [ 0.000000e+00, %entry ], [ %i235, %for.cond28.preheader ] + %accum.sroa.90.0 = phi float [ 0.000000e+00, %entry ], [ %i231, %for.cond28.preheader ] + %accum.sroa.86.0 = phi float [ 0.000000e+00, %entry ], [ %i227, %for.cond28.preheader ] + %accum.sroa.82.0 = phi float [ 0.000000e+00, %entry ], [ %i223, %for.cond28.preheader ] + %accum.sroa.78.0 = phi float [ 0.000000e+00, %entry ], [ %i219, %for.cond28.preheader ] + %accum.sroa.74.0 = phi float [ 0.000000e+00, %entry ], [ %i215, %for.cond28.preheader ] + %accum.sroa.70.0 = phi float [ 0.000000e+00, %entry ], [ %i211, %for.cond28.preheader ] + %accum.sroa.66.0 = phi float [ 0.000000e+00, %entry ], [ %i207, %for.cond28.preheader ] + %accum.sroa.62.0 = phi float [ 0.000000e+00, %entry ], [ %i203, %for.cond28.preheader ] + %accum.sroa.58.0 = phi float [ 0.000000e+00, %entry ], [ %i199, %for.cond28.preheader ] + %accum.sroa.54.0 = phi float [ 0.000000e+00, %entry ], [ %i195, %for.cond28.preheader ] + %accum.sroa.50.0 = phi float [ 0.000000e+00, %entry ], [ %i191, %for.cond28.preheader ] + %accum.sroa.46.0 = phi float [ 0.000000e+00, %entry ], [ %i187, %for.cond28.preheader ] + %accum.sroa.42.0 = phi float [ 0.000000e+00, %entry ], [ %i183, %for.cond28.preheader ] + %accum.sroa.38.0 = phi float [ 0.000000e+00, %entry ], [ %i179, %for.cond28.preheader ] + %accum.sroa.34.0 = phi float [ 0.000000e+00, %entry ], [ %i175, %for.cond28.preheader ] + %accum.sroa.30.0 = phi float [ 0.000000e+00, %entry ], [ %i171, %for.cond28.preheader ] + %accum.sroa.26.0 = phi float [ 0.000000e+00, %entry ], [ %i167, %for.cond28.preheader ] + %accum.sroa.22.0 = phi float [ 0.000000e+00, %entry ], [ %i163, %for.cond28.preheader ] + %accum.sroa.18.0 = phi float [ 0.000000e+00, %entry ], [ %i159, %for.cond28.preheader ] + %accum.sroa.14.0 = phi float [ 0.000000e+00, %entry ], [ %i155, %for.cond28.preheader ] + %accum.sroa.10.0 = phi float [ 0.000000e+00, %entry ], [ %i151, %for.cond28.preheader ] + %accum.sroa.6.0 = phi float [ 0.000000e+00, %entry ], [ %i147, %for.cond28.preheader ] + %accum.sroa.0.0 = phi float [ 0.000000e+00, %entry ], [ %i143, %for.cond28.preheader ] + %accum.sroa.114.0 = phi float [ 0.000000e+00, %entry ], [ %i255, %for.cond28.preheader ] + %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ] + %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ] + %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ] + %i_ptr.0288 = phi float addrspace(1)* [ undef, %entry ], [ %add.ptr47.3, %for.cond28.preheader ] + %w_ptr.0287 = phi float addrspace(4)* [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ] + %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ] + %i8 = load float, float addrspace(1)* %i_ptr.0288, align 4 + %add.ptr47 = getelementptr inbounds float, float addrspace(1)* %i_ptr.0288, i64 49 + %i9 = load float, float addrspace(1)* %add.ptr47, align 4 + %add.ptr47.1 = getelementptr inbounds float, float addrspace(1)* %i_ptr.0288, i64 98 + %i10 = load float, float addrspace(1)* %add.ptr47.1, align 4 + %add.ptr47.2 = getelementptr inbounds float, float addrspace(1)* %i_ptr.0288, i64 147 + %i11 = load float, float addrspace(1)* %add.ptr47.2, align 4 + %i12 = load float, float addrspace(4)* %w_ptr.0287, align 4 + %add.ptr66 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1024 + %i13 = load float, float addrspace(4)* %add.ptr66, align 4 
+ %add.ptr66.1 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2048 + %i14 = load float, float addrspace(4)* %add.ptr66.1, align 4 + %add.ptr66.2 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3072 + %i15 = load float, float addrspace(4)* %add.ptr66.2, align 4 + %add.ptr70 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1 + %i16 = load float, float addrspace(4)* %add.ptr70, align 4 + %add.ptr66.1291 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1025 + %i17 = load float, float addrspace(4)* %add.ptr66.1291, align 4 + %add.ptr66.1.1 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2049 + %i18 = load float, float addrspace(4)* %add.ptr66.1.1, align 4 + %add.ptr66.2.1 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3073 + %i19 = load float, float addrspace(4)* %add.ptr66.2.1, align 4 + %add.ptr70.1 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2 + %i20 = load float, float addrspace(4)* %add.ptr70.1, align 4 + %add.ptr66.2293 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1026 + %i21 = load float, float addrspace(4)* %add.ptr66.2293, align 4 + %add.ptr66.1.2 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2050 + %i22 = load float, float addrspace(4)* %add.ptr66.1.2, align 4 + %add.ptr66.2.2 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3074 + %i23 = load float, float addrspace(4)* %add.ptr66.2.2, align 4 + %add.ptr70.2 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3 + %i24 = load float, float addrspace(4)* %add.ptr70.2, align 4 + %add.ptr66.3 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1027 + %i25 = load float, float addrspace(4)* %add.ptr66.3, align 4 + %add.ptr66.1.3 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2051 + %i26 = load float, float addrspace(4)* %add.ptr66.1.3, align 4 + %add.ptr66.2.3 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3075 + %i27 = load float, float addrspace(4)* %add.ptr66.2.3, align 4 + %add.ptr70.3 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 4 + %i28 = load float, float addrspace(4)* %add.ptr70.3, align 4 + %add.ptr66.4 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1028 + %i29 = load float, float addrspace(4)* %add.ptr66.4, align 4 + %add.ptr66.1.4 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2052 + %i30 = load float, float addrspace(4)* %add.ptr66.1.4, align 4 + %add.ptr66.2.4 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3076 + %i31 = load float, float addrspace(4)* %add.ptr66.2.4, align 4 + %add.ptr70.4 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 5 + %i32 = load float, float addrspace(4)* %add.ptr70.4, align 4 + %add.ptr66.5 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1029 + %i33 = load float, float addrspace(4)* %add.ptr66.5, align 4 + %add.ptr66.1.5 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2053 + %i34 = load float, float addrspace(4)* %add.ptr66.1.5, align 4 + %add.ptr66.2.5 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3077 + %i35 = load float, float addrspace(4)* %add.ptr66.2.5, align 4 + %add.ptr70.5 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 6 + %i36 = load float, float addrspace(4)* %add.ptr70.5, align 4 + %add.ptr66.6 = 
getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1030 + %i37 = load float, float addrspace(4)* %add.ptr66.6, align 4 + %add.ptr66.1.6 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2054 + %i38 = load float, float addrspace(4)* %add.ptr66.1.6, align 4 + %add.ptr66.2.6 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3078 + %i39 = load float, float addrspace(4)* %add.ptr66.2.6, align 4 + %add.ptr70.6 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 7 + %i40 = load float, float addrspace(4)* %add.ptr70.6, align 4 + %add.ptr66.7 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1031 + %i41 = load float, float addrspace(4)* %add.ptr66.7, align 4 + %add.ptr66.1.7 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2055 + %i42 = load float, float addrspace(4)* %add.ptr66.1.7, align 4 + %add.ptr66.2.7 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3079 + %i43 = load float, float addrspace(4)* %add.ptr66.2.7, align 4 + %add.ptr70.7 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 8 + %i44 = load float, float addrspace(4)* %add.ptr70.7, align 4 + %add.ptr66.8 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1032 + %i45 = load float, float addrspace(4)* %add.ptr66.8, align 4 + %add.ptr66.1.8 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2056 + %i46 = load float, float addrspace(4)* %add.ptr66.1.8, align 4 + %add.ptr66.2.8 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3080 + %i47 = load float, float addrspace(4)* %add.ptr66.2.8, align 4 + %add.ptr70.8 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 9 + %i48 = load float, float addrspace(4)* %add.ptr70.8, align 4 + %add.ptr66.9 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1033 + %i49 = load float, float addrspace(4)* %add.ptr66.9, align 4 + %add.ptr66.1.9 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2057 + %i50 = load float, float addrspace(4)* %add.ptr66.1.9, align 4 + %add.ptr66.2.9 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3081 + %i51 = load float, float addrspace(4)* %add.ptr66.2.9, align 4 + %add.ptr70.9 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 10 + %i52 = load float, float addrspace(4)* %add.ptr70.9, align 4 + %add.ptr66.10 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1034 + %i53 = load float, float addrspace(4)* %add.ptr66.10, align 4 + %add.ptr66.1.10 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2058 + %i54 = load float, float addrspace(4)* %add.ptr66.1.10, align 4 + %add.ptr66.2.10 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3082 + %i55 = load float, float addrspace(4)* %add.ptr66.2.10, align 4 + %add.ptr70.10 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 11 + %i56 = load float, float addrspace(4)* %add.ptr70.10, align 4 + %add.ptr66.11 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1035 + %i57 = load float, float addrspace(4)* %add.ptr66.11, align 4 + %add.ptr66.1.11 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2059 + %i58 = load float, float addrspace(4)* %add.ptr66.1.11, align 4 + %add.ptr66.2.11 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3083 + %i59 = load float, float addrspace(4)* %add.ptr66.2.11, align 4 + %add.ptr70.11 = 
getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 12 + %i60 = load float, float addrspace(4)* %add.ptr70.11, align 4 + %add.ptr66.12 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1036 + %i61 = load float, float addrspace(4)* %add.ptr66.12, align 4 + %add.ptr66.1.12 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2060 + %i62 = load float, float addrspace(4)* %add.ptr66.1.12, align 4 + %add.ptr66.2.12 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3084 + %i63 = load float, float addrspace(4)* %add.ptr66.2.12, align 4 + %add.ptr70.12 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 13 + %i64 = load float, float addrspace(4)* %add.ptr70.12, align 4 + %add.ptr66.13 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1037 + %i65 = load float, float addrspace(4)* %add.ptr66.13, align 4 + %add.ptr66.1.13 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2061 + %i66 = load float, float addrspace(4)* %add.ptr66.1.13, align 4 + %add.ptr66.2.13 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3085 + %i67 = load float, float addrspace(4)* %add.ptr66.2.13, align 4 + %add.ptr70.13 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 14 + %i68 = load float, float addrspace(4)* %add.ptr70.13, align 4 + %add.ptr66.14 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1038 + %i69 = load float, float addrspace(4)* %add.ptr66.14, align 4 + %add.ptr66.1.14 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2062 + %i70 = load float, float addrspace(4)* %add.ptr66.1.14, align 4 + %add.ptr66.2.14 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3086 + %i71 = load float, float addrspace(4)* %add.ptr66.2.14, align 4 + %add.ptr70.14 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 15 + %i72 = load float, float addrspace(4)* %add.ptr70.14, align 4 + %add.ptr66.15 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1039 + %i73 = load float, float addrspace(4)* %add.ptr66.15, align 4 + %add.ptr66.1.15 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2063 + %i74 = load float, float addrspace(4)* %add.ptr66.1.15, align 4 + %add.ptr66.2.15 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3087 + %i75 = load float, float addrspace(4)* %add.ptr66.2.15, align 4 + %add.ptr70.15 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 16 + %i76 = load float, float addrspace(4)* %add.ptr70.15, align 4 + %add.ptr66.16 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1040 + %i77 = load float, float addrspace(4)* %add.ptr66.16, align 4 + %add.ptr66.1.16 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2064 + %i78 = load float, float addrspace(4)* %add.ptr66.1.16, align 4 + %add.ptr66.2.16 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3088 + %i79 = load float, float addrspace(4)* %add.ptr66.2.16, align 4 + %add.ptr70.16 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 17 + %i80 = load float, float addrspace(4)* %add.ptr70.16, align 4 + %add.ptr66.17 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1041 + %i81 = load float, float addrspace(4)* %add.ptr66.17, align 4 + %add.ptr66.1.17 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2065 + %i82 = load float, float addrspace(4)* %add.ptr66.1.17, align 
4 + %add.ptr66.2.17 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3089 + %i83 = load float, float addrspace(4)* %add.ptr66.2.17, align 4 + %add.ptr70.17 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 18 + %i84 = load float, float addrspace(4)* %add.ptr70.17, align 4 + %add.ptr66.18 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1042 + %i85 = load float, float addrspace(4)* %add.ptr66.18, align 4 + %add.ptr66.1.18 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2066 + %i86 = load float, float addrspace(4)* %add.ptr66.1.18, align 4 + %add.ptr66.2.18 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3090 + %i87 = load float, float addrspace(4)* %add.ptr66.2.18, align 4 + %add.ptr70.18 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 19 + %i88 = load float, float addrspace(4)* %add.ptr70.18, align 4 + %add.ptr66.19 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1043 + %i89 = load float, float addrspace(4)* %add.ptr66.19, align 4 + %add.ptr66.1.19 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2067 + %i90 = load float, float addrspace(4)* %add.ptr66.1.19, align 4 + %add.ptr66.2.19 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3091 + %i91 = load float, float addrspace(4)* %add.ptr66.2.19, align 4 + %add.ptr70.19 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 20 + %i92 = load float, float addrspace(4)* %add.ptr70.19, align 4 + %add.ptr66.20 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1044 + %i93 = load float, float addrspace(4)* %add.ptr66.20, align 4 + %add.ptr66.1.20 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2068 + %i94 = load float, float addrspace(4)* %add.ptr66.1.20, align 4 + %add.ptr66.2.20 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3092 + %i95 = load float, float addrspace(4)* %add.ptr66.2.20, align 4 + %add.ptr70.20 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 21 + %i96 = load float, float addrspace(4)* %add.ptr70.20, align 4 + %add.ptr66.21 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1045 + %i97 = load float, float addrspace(4)* %add.ptr66.21, align 4 + %add.ptr66.1.21 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2069 + %i98 = load float, float addrspace(4)* %add.ptr66.1.21, align 4 + %add.ptr66.2.21 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3093 + %i99 = load float, float addrspace(4)* %add.ptr66.2.21, align 4 + %add.ptr70.21 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 22 + %i100 = load float, float addrspace(4)* %add.ptr70.21, align 4 + %add.ptr66.22 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1046 + %i101 = load float, float addrspace(4)* %add.ptr66.22, align 4 + %add.ptr66.1.22 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2070 + %i102 = load float, float addrspace(4)* %add.ptr66.1.22, align 4 + %add.ptr66.2.22 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3094 + %i103 = load float, float addrspace(4)* %add.ptr66.2.22, align 4 + %add.ptr70.22 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 23 + %i104 = load float, float addrspace(4)* %add.ptr70.22, align 4 + %add.ptr66.23 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1047 + %i105 = load float, float 
addrspace(4)* %add.ptr66.23, align 4 + %add.ptr66.1.23 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2071 + %i106 = load float, float addrspace(4)* %add.ptr66.1.23, align 4 + %add.ptr66.2.23 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3095 + %i107 = load float, float addrspace(4)* %add.ptr66.2.23, align 4 + %add.ptr70.23 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 24 + %i108 = load float, float addrspace(4)* %add.ptr70.23, align 4 + %add.ptr66.24 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1048 + %i109 = load float, float addrspace(4)* %add.ptr66.24, align 4 + %add.ptr66.1.24 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2072 + %i110 = load float, float addrspace(4)* %add.ptr66.1.24, align 4 + %add.ptr66.2.24 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3096 + %i111 = load float, float addrspace(4)* %add.ptr66.2.24, align 4 + %add.ptr70.24 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 25 + %i112 = load float, float addrspace(4)* %add.ptr70.24, align 4 + %add.ptr66.25 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1049 + %i113 = load float, float addrspace(4)* %add.ptr66.25, align 4 + %add.ptr66.1.25 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2073 + %i114 = load float, float addrspace(4)* %add.ptr66.1.25, align 4 + %add.ptr66.2.25 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3097 + %i115 = load float, float addrspace(4)* %add.ptr66.2.25, align 4 + %add.ptr70.25 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 26 + %i116 = load float, float addrspace(4)* %add.ptr70.25, align 4 + %add.ptr66.26 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1050 + %i117 = load float, float addrspace(4)* %add.ptr66.26, align 4 + %add.ptr66.1.26 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2074 + %i118 = load float, float addrspace(4)* %add.ptr66.1.26, align 4 + %add.ptr66.2.26 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3098 + %i119 = load float, float addrspace(4)* %add.ptr66.2.26, align 4 + %add.ptr70.26 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 27 + %i120 = load float, float addrspace(4)* %add.ptr70.26, align 4 + %add.ptr66.27 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1051 + %i121 = load float, float addrspace(4)* %add.ptr66.27, align 4 + %add.ptr66.1.27 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2075 + %i122 = load float, float addrspace(4)* %add.ptr66.1.27, align 4 + %add.ptr66.2.27 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3099 + %i123 = load float, float addrspace(4)* %add.ptr66.2.27, align 4 + %add.ptr70.27 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 28 + %i124 = load float, float addrspace(4)* %add.ptr70.27, align 4 + %add.ptr66.28 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1052 + %i125 = load float, float addrspace(4)* %add.ptr66.28, align 4 + %add.ptr66.1.28 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2076 + %i126 = load float, float addrspace(4)* %add.ptr66.1.28, align 4 + %add.ptr66.2.28 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3100 + %i127 = load float, float addrspace(4)* %add.ptr66.2.28, align 4 + %add.ptr70.28 = getelementptr inbounds float, float addrspace(4)* 
%w_ptr.0287, i64 29 + %i128 = load float, float addrspace(4)* %add.ptr70.28, align 4 + %add.ptr66.29 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1053 + %i129 = load float, float addrspace(4)* %add.ptr66.29, align 4 + %add.ptr66.1.29 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2077 + %i130 = load float, float addrspace(4)* %add.ptr66.1.29, align 4 + %add.ptr66.2.29 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3101 + %i131 = load float, float addrspace(4)* %add.ptr66.2.29, align 4 + %add.ptr70.29 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 30 + %i132 = load float, float addrspace(4)* %add.ptr70.29, align 4 + %add.ptr66.30 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1054 + %i133 = load float, float addrspace(4)* %add.ptr66.30, align 4 + %add.ptr66.1.30 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2078 + %i134 = load float, float addrspace(4)* %add.ptr66.1.30, align 4 + %add.ptr66.2.30 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3102 + %i135 = load float, float addrspace(4)* %add.ptr66.2.30, align 4 + %add.ptr70.30 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 31 + %i136 = load float, float addrspace(4)* %add.ptr70.30, align 4 + %add.ptr66.31 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1055 + %i137 = load float, float addrspace(4)* %add.ptr66.31, align 4 + %add.ptr66.1.31 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2079 + %i138 = load float, float addrspace(4)* %add.ptr66.1.31, align 4 + %add.ptr66.2.31 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3103 + %i139 = load float, float addrspace(4)* %add.ptr66.2.31, align 4 + %add.ptr47.3 = getelementptr inbounds float, float addrspace(1)* %i_ptr.0288, i64 196 + %i140 = tail call float @llvm.fmuladd.f32(float %i8, float %i12, float %accum.sroa.0.0) + %i141 = tail call float @llvm.fmuladd.f32(float %i9, float %i13, float %i140) + %i142 = tail call float @llvm.fmuladd.f32(float %i10, float %i14, float %i141) + %i143 = tail call float @llvm.fmuladd.f32(float %i11, float %i15, float %i142) + %i144 = tail call float @llvm.fmuladd.f32(float %i8, float %i16, float %accum.sroa.6.0) + %i145 = tail call float @llvm.fmuladd.f32(float %i9, float %i17, float %i144) + %i146 = tail call float @llvm.fmuladd.f32(float %i10, float %i18, float %i145) + %i147 = tail call float @llvm.fmuladd.f32(float %i11, float %i19, float %i146) + %i148 = tail call float @llvm.fmuladd.f32(float %i8, float %i20, float %accum.sroa.10.0) + %i149 = tail call float @llvm.fmuladd.f32(float %i9, float %i21, float %i148) + %i150 = tail call float @llvm.fmuladd.f32(float %i10, float %i22, float %i149) + %i151 = tail call float @llvm.fmuladd.f32(float %i11, float %i23, float %i150) + %i152 = tail call float @llvm.fmuladd.f32(float %i8, float %i24, float %accum.sroa.14.0) + %i153 = tail call float @llvm.fmuladd.f32(float %i9, float %i25, float %i152) + %i154 = tail call float @llvm.fmuladd.f32(float %i10, float %i26, float %i153) + %i155 = tail call float @llvm.fmuladd.f32(float %i11, float %i27, float %i154) + %i156 = tail call float @llvm.fmuladd.f32(float %i8, float %i28, float %accum.sroa.18.0) + %i157 = tail call float @llvm.fmuladd.f32(float %i9, float %i29, float %i156) + %i158 = tail call float @llvm.fmuladd.f32(float %i10, float %i30, float %i157) + %i159 = tail call float @llvm.fmuladd.f32(float %i11, float %i31, float 
%i158) + %i160 = tail call float @llvm.fmuladd.f32(float %i8, float %i32, float %accum.sroa.22.0) + %i161 = tail call float @llvm.fmuladd.f32(float %i9, float %i33, float %i160) + %i162 = tail call float @llvm.fmuladd.f32(float %i10, float %i34, float %i161) + %i163 = tail call float @llvm.fmuladd.f32(float %i11, float %i35, float %i162) + %i164 = tail call float @llvm.fmuladd.f32(float %i8, float %i36, float %accum.sroa.26.0) + %i165 = tail call float @llvm.fmuladd.f32(float %i9, float %i37, float %i164) + %i166 = tail call float @llvm.fmuladd.f32(float %i10, float %i38, float %i165) + %i167 = tail call float @llvm.fmuladd.f32(float %i11, float %i39, float %i166) + %i168 = tail call float @llvm.fmuladd.f32(float %i8, float %i40, float %accum.sroa.30.0) + %i169 = tail call float @llvm.fmuladd.f32(float %i9, float %i41, float %i168) + %i170 = tail call float @llvm.fmuladd.f32(float %i10, float %i42, float %i169) + %i171 = tail call float @llvm.fmuladd.f32(float %i11, float %i43, float %i170) + %i172 = tail call float @llvm.fmuladd.f32(float %i8, float %i44, float %accum.sroa.34.0) + %i173 = tail call float @llvm.fmuladd.f32(float %i9, float %i45, float %i172) + %i174 = tail call float @llvm.fmuladd.f32(float %i10, float %i46, float %i173) + %i175 = tail call float @llvm.fmuladd.f32(float %i11, float %i47, float %i174) + %i176 = tail call float @llvm.fmuladd.f32(float %i8, float %i48, float %accum.sroa.38.0) + %i177 = tail call float @llvm.fmuladd.f32(float %i9, float %i49, float %i176) + %i178 = tail call float @llvm.fmuladd.f32(float %i10, float %i50, float %i177) + %i179 = tail call float @llvm.fmuladd.f32(float %i11, float %i51, float %i178) + %i180 = tail call float @llvm.fmuladd.f32(float %i8, float %i52, float %accum.sroa.42.0) + %i181 = tail call float @llvm.fmuladd.f32(float %i9, float %i53, float %i180) + %i182 = tail call float @llvm.fmuladd.f32(float %i10, float %i54, float %i181) + %i183 = tail call float @llvm.fmuladd.f32(float %i11, float %i55, float %i182) + %i184 = tail call float @llvm.fmuladd.f32(float %i8, float %i56, float %accum.sroa.46.0) + %i185 = tail call float @llvm.fmuladd.f32(float %i9, float %i57, float %i184) + %i186 = tail call float @llvm.fmuladd.f32(float %i10, float %i58, float %i185) + %i187 = tail call float @llvm.fmuladd.f32(float %i11, float %i59, float %i186) + %i188 = tail call float @llvm.fmuladd.f32(float %i8, float %i60, float %accum.sroa.50.0) + %i189 = tail call float @llvm.fmuladd.f32(float %i9, float %i61, float %i188) + %i190 = tail call float @llvm.fmuladd.f32(float %i10, float %i62, float %i189) + %i191 = tail call float @llvm.fmuladd.f32(float %i11, float %i63, float %i190) + %i192 = tail call float @llvm.fmuladd.f32(float %i8, float %i64, float %accum.sroa.54.0) + %i193 = tail call float @llvm.fmuladd.f32(float %i9, float %i65, float %i192) + %i194 = tail call float @llvm.fmuladd.f32(float %i10, float %i66, float %i193) + %i195 = tail call float @llvm.fmuladd.f32(float %i11, float %i67, float %i194) + %i196 = tail call float @llvm.fmuladd.f32(float %i8, float %i68, float %accum.sroa.58.0) + %i197 = tail call float @llvm.fmuladd.f32(float %i9, float %i69, float %i196) + %i198 = tail call float @llvm.fmuladd.f32(float %i10, float %i70, float %i197) + %i199 = tail call float @llvm.fmuladd.f32(float %i11, float %i71, float %i198) + %i200 = tail call float @llvm.fmuladd.f32(float %i8, float %i72, float %accum.sroa.62.0) + %i201 = tail call float @llvm.fmuladd.f32(float %i9, float %i73, float %i200) + %i202 = tail call float 
@llvm.fmuladd.f32(float %i10, float %i74, float %i201) + %i203 = tail call float @llvm.fmuladd.f32(float %i11, float %i75, float %i202) + %i204 = tail call float @llvm.fmuladd.f32(float %i8, float %i76, float %accum.sroa.66.0) + %i205 = tail call float @llvm.fmuladd.f32(float %i9, float %i77, float %i204) + %i206 = tail call float @llvm.fmuladd.f32(float %i10, float %i78, float %i205) + %i207 = tail call float @llvm.fmuladd.f32(float %i11, float %i79, float %i206) + %i208 = tail call float @llvm.fmuladd.f32(float %i8, float %i80, float %accum.sroa.70.0) + %i209 = tail call float @llvm.fmuladd.f32(float %i9, float %i81, float %i208) + %i210 = tail call float @llvm.fmuladd.f32(float %i10, float %i82, float %i209) + %i211 = tail call float @llvm.fmuladd.f32(float %i11, float %i83, float %i210) + %i212 = tail call float @llvm.fmuladd.f32(float %i8, float %i84, float %accum.sroa.74.0) + %i213 = tail call float @llvm.fmuladd.f32(float %i9, float %i85, float %i212) + %i214 = tail call float @llvm.fmuladd.f32(float %i10, float %i86, float %i213) + %i215 = tail call float @llvm.fmuladd.f32(float %i11, float %i87, float %i214) + %i216 = tail call float @llvm.fmuladd.f32(float %i8, float %i88, float %accum.sroa.78.0) + %i217 = tail call float @llvm.fmuladd.f32(float %i9, float %i89, float %i216) + %i218 = tail call float @llvm.fmuladd.f32(float %i10, float %i90, float %i217) + %i219 = tail call float @llvm.fmuladd.f32(float %i11, float %i91, float %i218) + %i220 = tail call float @llvm.fmuladd.f32(float %i8, float %i92, float %accum.sroa.82.0) + %i221 = tail call float @llvm.fmuladd.f32(float %i9, float %i93, float %i220) + %i222 = tail call float @llvm.fmuladd.f32(float %i10, float %i94, float %i221) + %i223 = tail call float @llvm.fmuladd.f32(float %i11, float %i95, float %i222) + %i224 = tail call float @llvm.fmuladd.f32(float %i8, float %i96, float %accum.sroa.86.0) + %i225 = tail call float @llvm.fmuladd.f32(float %i9, float %i97, float %i224) + %i226 = tail call float @llvm.fmuladd.f32(float %i10, float %i98, float %i225) + %i227 = tail call float @llvm.fmuladd.f32(float %i11, float %i99, float %i226) + %i228 = tail call float @llvm.fmuladd.f32(float %i8, float %i100, float %accum.sroa.90.0) + %i229 = tail call float @llvm.fmuladd.f32(float %i9, float %i101, float %i228) + %i230 = tail call float @llvm.fmuladd.f32(float %i10, float %i102, float %i229) + %i231 = tail call float @llvm.fmuladd.f32(float %i11, float %i103, float %i230) + %i232 = tail call float @llvm.fmuladd.f32(float %i8, float %i104, float %accum.sroa.94.0) + %i233 = tail call float @llvm.fmuladd.f32(float %i9, float %i105, float %i232) + %i234 = tail call float @llvm.fmuladd.f32(float %i10, float %i106, float %i233) + %i235 = tail call float @llvm.fmuladd.f32(float %i11, float %i107, float %i234) + %i236 = tail call float @llvm.fmuladd.f32(float %i8, float %i108, float %accum.sroa.98.0) + %i237 = tail call float @llvm.fmuladd.f32(float %i9, float %i109, float %i236) + %i238 = tail call float @llvm.fmuladd.f32(float %i10, float %i110, float %i237) + %i239 = tail call float @llvm.fmuladd.f32(float %i11, float %i111, float %i238) + %i240 = tail call float @llvm.fmuladd.f32(float %i8, float %i112, float %accum.sroa.102.0) + %i241 = tail call float @llvm.fmuladd.f32(float %i9, float %i113, float %i240) + %i242 = tail call float @llvm.fmuladd.f32(float %i10, float %i114, float %i241) + %i243 = tail call float @llvm.fmuladd.f32(float %i11, float %i115, float %i242) + %i244 = tail call float @llvm.fmuladd.f32(float %i8, float %i116, 
float %accum.sroa.106.0) + %i245 = tail call float @llvm.fmuladd.f32(float %i9, float %i117, float %i244) + %i246 = tail call float @llvm.fmuladd.f32(float %i10, float %i118, float %i245) + %i247 = tail call float @llvm.fmuladd.f32(float %i11, float %i119, float %i246) + %i248 = tail call float @llvm.fmuladd.f32(float %i8, float %i120, float %accum.sroa.110.0) + %i249 = tail call float @llvm.fmuladd.f32(float %i9, float %i121, float %i248) + %i250 = tail call float @llvm.fmuladd.f32(float %i10, float %i122, float %i249) + %i251 = tail call float @llvm.fmuladd.f32(float %i11, float %i123, float %i250) + %i252 = tail call float @llvm.fmuladd.f32(float %i8, float %i124, float %accum.sroa.114.0) + %i253 = tail call float @llvm.fmuladd.f32(float %i9, float %i125, float %i252) + %i254 = tail call float @llvm.fmuladd.f32(float %i10, float %i126, float %i253) + %i255 = tail call float @llvm.fmuladd.f32(float %i11, float %i127, float %i254) + %i256 = tail call float @llvm.fmuladd.f32(float %i8, float %i128, float %accum.sroa.118.0) + %i257 = tail call float @llvm.fmuladd.f32(float %i9, float %i129, float %i256) + %i258 = tail call float @llvm.fmuladd.f32(float %i10, float %i130, float %i257) + %i259 = tail call float @llvm.fmuladd.f32(float %i11, float %i131, float %i258) + %i260 = tail call float @llvm.fmuladd.f32(float %i8, float %i132, float %accum.sroa.122.0) + %i261 = tail call float @llvm.fmuladd.f32(float %i9, float %i133, float %i260) + %i262 = tail call float @llvm.fmuladd.f32(float %i10, float %i134, float %i261) + %i263 = tail call float @llvm.fmuladd.f32(float %i11, float %i135, float %i262) + %i264 = tail call float @llvm.fmuladd.f32(float %i8, float %i136, float %accum.sroa.126.0) + %i265 = tail call float @llvm.fmuladd.f32(float %i9, float %i137, float %i264) + %i266 = tail call float @llvm.fmuladd.f32(float %i10, float %i138, float %i265) + %i267 = tail call float @llvm.fmuladd.f32(float %i11, float %i139, float %i266) + %add.ptr74 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 4096 + %inc116 = add nuw nsw i32 %ci.0286, 1 + %exitcond.not = icmp eq i32 %inc116, 512 + br i1 %exitcond.not, label %for.cond.cleanup26, label %for.cond28.preheader + +for.cond.cleanup26: ; preds = %for.cond28.preheader + %mul119 = shl nuw nsw i32 undef, 1 + %mul120 = mul i32 %div, 200704 + %mul121 = mul i32 undef, 6272 + %add122 = add i32 %mul120, %mul121 + %mul123 = mul nuw nsw i32 undef, 28 + %add124 = add i32 %add122, %mul123 + %add126 = add i32 %add124, %mul119 + %idx.ext127 = zext i32 %add126 to i64 + %add.ptr128 = getelementptr inbounds float, float addrspace(1)* %out_ptr, i64 %idx.ext127 + store float %i143, float addrspace(1)* %add.ptr128, align 4 + %add.ptr184 = getelementptr inbounds float, float addrspace(1)* %add.ptr128, i64 196 + store float %i147, float addrspace(1)* %add.ptr184, align 4 + %add.ptr167.1 = getelementptr inbounds float, float addrspace(1)* %add.ptr184, i64 14 + store float 0.000000e+00, float addrspace(1)* %add.ptr167.1, align 4 + %add.ptr175.1.1 = getelementptr inbounds float, float addrspace(1)* %add.ptr167.1, i64 1 + store float 0.000000e+00, float addrspace(1)* %add.ptr175.1.1, align 4 + %add.ptr184.1 = getelementptr inbounds float, float addrspace(1)* %add.ptr184, i64 196 + store float %i151, float addrspace(1)* %add.ptr184.1, align 4 + %add.ptr184.2 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.1, i64 196 + store float %i155, float addrspace(1)* %add.ptr184.2, align 4 + %add.ptr184.3 = getelementptr inbounds float, float 
addrspace(1)* %add.ptr184.2, i64 196 + store float %i159, float addrspace(1)* %add.ptr184.3, align 4 + %add.ptr184.4 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.3, i64 196 + store float %i163, float addrspace(1)* %add.ptr184.4, align 4 + %add.ptr154.5 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.4, i64 1 + store float 0.000000e+00, float addrspace(1)* %add.ptr154.5, align 4 + %add.ptr184.5 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.4, i64 196 + store float %i167, float addrspace(1)* %add.ptr184.5, align 4 + %add.ptr154.6 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.5, i64 1 + store float 0.000000e+00, float addrspace(1)* %add.ptr154.6, align 4 + %add.ptr184.6 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.5, i64 196 + store float %i171, float addrspace(1)* %add.ptr184.6, align 4 + %add.ptr184.7 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.6, i64 196 + store float %i175, float addrspace(1)* %add.ptr184.7, align 4 + %add.ptr167.8 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.7, i64 14 + store float 0.000000e+00, float addrspace(1)* %add.ptr167.8, align 4 + %add.ptr175.1.8 = getelementptr inbounds float, float addrspace(1)* %add.ptr167.8, i64 1 + store float 0.000000e+00, float addrspace(1)* %add.ptr175.1.8, align 4 + %add.ptr184.8 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.7, i64 196 + store float %i179, float addrspace(1)* %add.ptr184.8, align 4 + %add.ptr184.9 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.8, i64 196 + store float %i183, float addrspace(1)* %add.ptr184.9, align 4 + %add.ptr184.10 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.9, i64 196 + store float %i187, float addrspace(1)* %add.ptr184.10, align 4 + %add.ptr184.11 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.10, i64 196 + store float %i191, float addrspace(1)* %add.ptr184.11, align 4 + %add.ptr184.12 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.11, i64 196 + store float %i195, float addrspace(1)* %add.ptr184.12, align 4 + %add.ptr184.13 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.12, i64 196 + store float %i199, float addrspace(1)* %add.ptr184.13, align 4 + %add.ptr184.14 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.13, i64 196 + store float %i203, float addrspace(1)* %add.ptr184.14, align 4 + %add.ptr184.15 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.14, i64 196 + store float %i207, float addrspace(1)* %add.ptr184.15, align 4 + %add.ptr184.16 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.15, i64 196 + store float %i211, float addrspace(1)* %add.ptr184.16, align 4 + %add.ptr184.17 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.16, i64 196 + store float %i215, float addrspace(1)* %add.ptr184.17, align 4 + %add.ptr184.18 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.17, i64 196 + store float %i219, float addrspace(1)* %add.ptr184.18, align 4 + %add.ptr184.19 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.18, i64 196 + store float %i223, float addrspace(1)* %add.ptr184.19, align 4 + %add.ptr184.20 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.19, i64 196 + store float %i227, float addrspace(1)* %add.ptr184.20, align 4 + %add.ptr184.21 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.20, i64 196 + store float %i231, float addrspace(1)* 
%add.ptr184.21, align 4 + %add.ptr184.22 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.21, i64 196 + store float %i235, float addrspace(1)* %add.ptr184.22, align 4 + %add.ptr184.23 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.22, i64 196 + store float %i239, float addrspace(1)* %add.ptr184.23, align 4 + %add.ptr184.24 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.23, i64 196 + store float %i243, float addrspace(1)* %add.ptr184.24, align 4 + %add.ptr184.25 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.24, i64 196 + store float %i247, float addrspace(1)* %add.ptr184.25, align 4 + %add.ptr184.26 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.25, i64 196 + store float %i251, float addrspace(1)* %add.ptr184.26, align 4 + %add.ptr184.27 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.26, i64 196 + store float %i255, float addrspace(1)* %add.ptr184.27, align 4 + %add.ptr184.28 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.27, i64 196 + store float %i259, float addrspace(1)* %add.ptr184.28, align 4 + %add.ptr184.29 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.28, i64 196 + store float %i263, float addrspace(1)* %add.ptr184.29, align 4 + %add.ptr184.30 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.29, i64 196 + store float %i267, float addrspace(1)* %add.ptr184.30, align 4 + ret void +} + +declare float @llvm.fmuladd.f32(float, float, float) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare i32 @llvm.amdgcn.workgroup.id.x() #1 +declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1 + +attributes #0 = { nofree nosync nounwind readnone speculatable willreturn } +attributes #1 = { nounwind readnone speculatable willreturn } + +!0 = !{i32 1, i32 2, i32 1, i32 0} +!1 = !{!"none", !"none", !"none", !"none"} +!2 = !{!"float*", !"float*", !"float*", !"float"} +!3 = !{!"restrict const", !"restrict const", !"restrict", !""} +!4 = !{i32 256, i32 1, i32 1} +!5 = !{i32 0, i32 1024} Index: llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -137,8 +137,8 @@ ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse ; GCN: NumVgprs: 10 -; GFX900: ScratchSize: 52 -; GFX908: ScratchSize: 20 +; GFX900: ScratchSize: 44 +; GFX908: ScratchSize: 12 ; GCN: VGPRBlocks: 2 ; GCN: NumVGPRsForWavesPerEU: 10 define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 { @@ -246,7 +246,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 1028 +; GFX900: ScratchSize: 1156 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 Index: llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -10,30 +10,31 @@ ; CHECK: bb.0..expVert: ; CHECK: liveins: $sgpr3, $sgpr4, $sgpr5, $sgpr8, $sgpr9, $sgpr10, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr25, $sgpr27, $sgpr31 ; CHECK: undef %56.sub0:sgpr_64 = COPY $sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr27 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr25 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = 
COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr18 + ; CHECK: SI_SPILL_S32_SAVE $sgpr27, %stack.2, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) + ; CHECK: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr25 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr18 ; CHECK: undef %50.sub0:sgpr_64 = COPY $sgpr19 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr20 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr21 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr22 - ; CHECK: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr23 - ; CHECK: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr9 - ; CHECK: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10 - ; CHECK: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr20 + ; CHECK: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr21 + ; CHECK: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr22 + ; CHECK: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr23 + ; CHECK: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr9 + ; CHECK: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr10 + ; CHECK: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK: undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0, 0 :: (load 8 from %ir.40, addrspace 4) ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; CHECK: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc - ; CHECK: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc - ; CHECK: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 4, implicit-def dead $scc ; CHECK: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc ; CHECK: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc - ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK: SI_SPILL_S32_SAVE [[S_AND_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.0, addrspace 5) ; CHECK: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc - ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY4]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: early-clobber %73:sgpr_128, early-clobber %143:sgpr_128, early-clobber %131:sreg_32_xm0_xexec = BUNDLE %130, undef %132:sgpr_128, undef %74:sreg_64 { ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0, 0 :: (load 16 from %ir.84, addrspace 4) @@ -49,22 +50,20 @@ ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 
= BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: } - ; CHECK: [[COPY13:%[0-9]+]]:sgpr_128 = COPY %71 + ; CHECK: SI_SPILL_S128_SAVE %71, %stack.1, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5) ; CHECK: %71.sub1:sgpr_128 = S_MOV_B32 0 ; CHECK: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc ; CHECK: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc - ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY5]], 64, implicit-def $scc + ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY4]], 64, implicit-def $scc ; CHECK: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %54:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %149.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %149.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK: undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %156.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc ; CHECK: undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK: early-clobber %150:sgpr_128, early-clobber %157:sgpr_128 = BUNDLE %149, %156 { - ; CHECK: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0, 0 :: (load 16 from %ir.91, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0, 0 :: (load 16 from %ir.97, addrspace 4) - ; CHECK: } + ; CHECK: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0, 0 :: (load 16 from %ir.91, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0, 0 :: (load 16 from %ir.97, addrspace 4) ; CHECK: %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %171:sreg_32, 31, implicit-def dead $scc ; CHECK: undef %176.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %171:sreg_32, implicit-def $scc @@ -92,67 +91,55 @@ ; CHECK: %253.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: undef %261.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], undef %171:sreg_32, implicit-def $scc ; CHECK: %261.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %273.sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK: undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %286.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %293.sub0:sreg_64 = 
S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %293.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc - ; CHECK: early-clobber %379:sreg_32_xm0_xexec, early-clobber %201:sgpr_128, early-clobber %177:sgpr_128, early-clobber %184:sgpr_128, early-clobber %319:sreg_32_xm0_xexec, early-clobber %191:sgpr_128, early-clobber %309:sreg_32_xm0_xexec, early-clobber %323:sreg_32_xm0_xexec, early-clobber %368:sreg_32_xm0_xexec, early-clobber %313:sreg_32_xm0_xexec, early-clobber %211:sgpr_128 = BUNDLE [[S_ADD_I32_]], %71, undef %369:sgpr_128, %210, undef %314:sreg_32, %200, undef %380:sgpr_128, %176, %183, [[S_ADD_I32_1]], %190, undef %370:sreg_32 { - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 16, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0, 0 :: (load 16 from %ir.111, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0, 0 :: (load 16 from %ir.117, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4) - ; CHECK: } - ; CHECK: early-clobber %151:vgpr_32, early-clobber %158:vgpr_32, early-clobber %165:vgpr_32 = BUNDLE [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], [[S_LOAD_DWORDX4_IMM3]], [[S_LOAD_DWORDX4_IMM4]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: } - ; CHECK: early-clobber %374:sreg_32_xm0_xexec, early-clobber 
%363:sreg_32_xm0_xexec = BUNDLE [[S_ADD_I32_]], undef %364:sgpr_128, undef %375:sgpr_128, [[S_ADD_I32_1]] { - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: } + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %71, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 16, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0, 0 :: (load 16 from %ir.111, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0, 0 :: (load 16 from %ir.117, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR]], -98, implicit-def dead $scc ; CHECK: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR1]], -114, implicit-def dead $scc ; CHECK: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR2]], -130, implicit-def dead $scc ; CHECK: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 
[[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %327.sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK: undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %335.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK: undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK: %343.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK: undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY9]], 4, implicit-def dead $scc ; CHECK: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %396:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: early-clobber %218:sgpr_128, early-clobber %225:sgpr_128, early-clobber %231:sgpr_128 = BUNDLE %217, %224, %50 { - ; CHECK: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4) - ; CHECK: } + ; CHECK: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: early-clobber %254:sgpr_128, early-clobber %242:sgpr_128 = BUNDLE %253, %241 { - ; CHECK: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 
from %ir.162, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4) - ; CHECK: } - ; CHECK: early-clobber %212:vgpr_32, early-clobber %202:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM8]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: } + ; CHECK: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 from %ir.162, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -217, implicit-def dead $scc ; CHECK: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -233, implicit-def dead $scc ; CHECK: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR5]], -249, implicit-def dead $scc @@ -161,41 +148,35 @@ ; CHECK: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -329, implicit-def dead $scc ; CHECK: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -345, implicit-def dead $scc ; CHECK: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR6]], -441, implicit-def dead $scc - ; CHECK: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 160, implicit-def $scc + ; CHECK: [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 160, implicit-def $scc ; CHECK: [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %36:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 4, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc ; CHECK: undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc ; CHECK: %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; 
CHECK: [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc ; CHECK: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc - ; CHECK: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK: undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc ; CHECK: %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: early-clobber %71.sub0:sgpr_128, early-clobber %262:sgpr_128 = BUNDLE %261, %441 { - ; CHECK: internal %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4) - ; CHECK: } + ; CHECK: %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0, 0 :: (load 16 from %ir.185, addrspace 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: early-clobber %445:sreg_32_xm0_xexec, early-clobber %287:sgpr_128 = BUNDLE %71, %286 { - ; CHECK: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: } + ; CHECK: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0, 0 :: (load 16 from %ir.200, addrspace 4) - ; CHECK: early-clobber %281:vgpr_32, early-clobber %275:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], 
[[S_LOAD_DWORDX4_IMM16]], [[V_MOV_B32_e32_]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: } - ; CHECK: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc @@ -203,20 +184,16 @@ ; CHECK: %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %453, 0, 0, 0 :: (load 8 from %ir.304, addrspace 4) ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: early-clobber %336:sgpr_128, early-clobber %352:sgpr_128, early-clobber %328:sgpr_128, early-clobber %344:sgpr_128 = BUNDLE %327, %343, %335, %351 { - ; CHECK: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4) - ; CHECK: } + ; CHECK: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4) ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: 
early-clobber %329:vgpr_32, early-clobber %345:vgpr_32, early-clobber %337:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM20]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], [[S_LOAD_DWORDX4_IMM21]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: } - ; CHECK: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 3, implicit-def dead $scc ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc @@ -229,7 +206,7 @@ ; CHECK: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0, 0 :: (load 16 from %ir.278, addrspace 4) ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0, 0 :: (load 4 from `i32 addrspace(4)* undef`, addrspace 4) ; CHECK: } - ; CHECK: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 3, implicit-def dead $scc + ; CHECK: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 3, implicit-def dead $scc ; CHECK: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0, 0 :: (load 16 from %ir.287, addrspace 4) ; CHECK: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc @@ -248,7 +225,8 @@ ; CHECK: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -507, implicit-def dead $scc ; CHECK: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -539, implicit-def dead $scc ; CHECK: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], 
-473, implicit-def dead $scc - ; CHECK: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 96, implicit-def $scc + ; CHECK: [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) + ; CHECK: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[SI_SPILL_S32_RESTORE]], 96, implicit-def $scc ; CHECK: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK: %514.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc @@ -368,8 +346,13 @@ ; CHECK: [[V_OR_B32_e32_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_60]], [[V_ADD_U32_e32_25]], implicit $exec ; CHECK: [[V_ADD_U32_e32_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_OR_B32_e32_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_61]], [[V_ADD_U32_e32_26]], implicit $exec - ; CHECK: [[COPY13]].sub1:sgpr_128 = COPY [[S_AND_B32_]] - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY13]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.0, addrspace 5) + ; CHECK: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 16 from %stack.1, align 4, addrspace 5) + ; CHECK: undef %914.sub2_sub3:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub2_sub3 { + ; CHECK: internal %914.sub0:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub0 + ; CHECK: } + ; CHECK: %914.sub1:sgpr_128 = COPY [[SI_SPILL_S32_RESTORE1]] + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %914, 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[V_ADD_U32_e32_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_OR_B32_e32_63:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_62]], [[V_ADD_U32_e32_27]], implicit $exec ; CHECK: [[V_ADD_U32_e32_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec