diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -62,6 +62,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n") BUILTIN(__builtin_amdgcn_s_barrier, "v", "n") BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") +BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") BUILTIN(__builtin_amdgcn_ds_gws_init, "vUiUi", "n") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -396,6 +396,19 @@ __builtin_amdgcn_wave_barrier(); } +// CHECK-LABEL: @test_sched_barrier +// CHECK: call void @llvm.amdgcn.sched.barrier(i32 0) +// CHECK: call void @llvm.amdgcn.sched.barrier(i32 1) +// CHECK: call void @llvm.amdgcn.sched.barrier(i32 4) +// CHECK: call void @llvm.amdgcn.sched.barrier(i32 15) +void test_sched_barrier() +{ + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_sched_barrier(1); + __builtin_amdgcn_sched_barrier(4); + __builtin_amdgcn_sched_barrier(15); +} + // CHECK-LABEL: @test_s_sleep // CHECK: call void @llvm.amdgcn.s.sleep(i32 1) // CHECK: call void @llvm.amdgcn.s.sleep(i32 15) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl --- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl @@ -60,6 +60,11 @@ __builtin_amdgcn_s_setprio(65536); // expected-warning {{implicit conversion from 'int' to 'short' changes value from 65536 to 0}} } +void test_sched_barrier(int x) +{ + __builtin_amdgcn_sched_barrier(x); // expected-error {{argument to '__builtin_amdgcn_sched_barrier' must be a constant integer}} +} + void test_sicmp_i32(global ulong* out, int a, int b, 
uint c) { *out = __builtin_amdgcn_sicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmp' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -213,6 +213,15 @@ def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>; +// The 1st parameter is a mask for the types of instructions that may be allowed +// to cross the SCHED_BARRIER during scheduling. +// MASK = 0: No instructions may be scheduled across SCHED_BARRIER. +// MASK = 1: Non-memory, non-side-effect producing instructions may be +// scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. +def int_amdgcn_sched_barrier : GCCBuiltin<"__builtin_amdgcn_sched_barrier">, + Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, + IntrHasSideEffects, IntrConvergent, IntrWillReturn]>; + def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">, Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -207,6 +207,16 @@ return; } + if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) { + if (isVerbose()) { + std::string HexString; + raw_string_ostream HexStream(HexString); + HexStream << format_hex(MI->getOperand(0).getImm(), 10, true); + OutStreamer->emitRawComment(" sched_barrier mask(" + HexString + ")"); + } + return; + } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { if (isVerbose()) OutStreamer->emitRawComment(" divergent unreachable"); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1773,6 +1773,7 @@ // hazard, even if one exist, won't really be visible. Should we handle it? case AMDGPU::SI_MASKED_UNREACHABLE: case AMDGPU::WAVE_BARRIER: + case AMDGPU::SCHED_BARRIER: return 0; } } @@ -3490,6 +3491,9 @@ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) return true; + if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0) + return true; + // Target-independent instructions do not have an implicit-use of EXEC, even // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -313,6 +313,18 @@ let Size = 0; } +def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask), + [(int_amdgcn_sched_barrier (i32 timm:$mask))]> { + let SchedRW = []; + let hasNoSchedulingInfo = 1; + let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; + let isConvergent = 1; + let FixedSize = 1; + let Size = 0; +} + // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -148,6 +148,7 @@ switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_s_barrier: case Intrinsic::amdgcn_wave_barrier: + case Intrinsic::amdgcn_sched_barrier: return false; default: break; diff --git a/llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir b/llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir @@ -1,21 +1,26 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-sched %s -o - | FileCheck -check-prefix=GCN %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s + +# WAVE_BARRIER and SI_MASKED_UNREACHABLE etc. are not really instructions. To +# fix the hazard (m0 def followed by V_INTERP), the compiler should insert a +# S_NOP. -# WAVE_BARRIER and SI_MASKED_UNREACHABLE are not really instructions. -# To fix the hazard (m0 def followed by V_INTERP), the scheduler -# should move another instruction into the slot. 
--- -# CHECK-LABEL: name: hazard_wave_barrier -# CHECK-LABEL: bb.0: -# GCN: $m0 = S_MOV_B32 killed renamable $sgpr0 -# GCN-NEXT: WAVE_BARRIER -# GCN-NEXT: S_MOV_B32 0 -# GCN-NEXT: V_INTERP_MOV_F32 name: hazard_wave_barrier tracksRegLiveness: true body: | bb.0: liveins: $sgpr0 + ; GCN-LABEL: name: hazard_wave_barrier + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $m0 = S_MOV_B32 killed renamable $sgpr0 + ; GCN-NEXT: WAVE_BARRIER + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_INTERP_MOV_F32 2, 0, 0, implicit $mode, implicit $m0, implicit $exec + ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: S_ENDPGM 0 $m0 = S_MOV_B32 killed renamable $sgpr0 WAVE_BARRIER renamable $vgpr0 = V_INTERP_MOV_F32 2, 0, 0, implicit $mode, implicit $m0, implicit $exec @@ -23,16 +28,24 @@ S_ENDPGM 0 ... -# GCN-LABEL: name: hazard-masked-unreachable -# CHECK-LABEL: bb.0: -# GCN: $m0 = S_MOV_B32 killed renamable $sgpr0 -# GCN-NEXT: SI_MASKED_UNREACHABLE -# GCN-NEXT: S_MOV_B32 0 -# GCN-NEXT: V_INTERP_MOV_F32 + --- name: hazard-masked-unreachable tracksRegLiveness: true body: | + ; GCN-LABEL: name: hazard-masked-unreachable + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $m0 = S_MOV_B32 killed renamable $sgpr0 + ; GCN-NEXT: SI_MASKED_UNREACHABLE + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_INTERP_MOV_F32 2, 0, 0, implicit $mode, implicit $m0, implicit $exec + ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: S_ENDPGM 0 bb.0: liveins: $sgpr0 @@ -43,3 +56,27 @@ bb.1: S_ENDPGM 0 ... 
+ +--- +name: hazard_sched_barrier +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GCN-LABEL: name: hazard_sched_barrier + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $m0 = S_MOV_B32 killed renamable $sgpr0 + ; GCN-NEXT: SCHED_BARRIER 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_INTERP_MOV_F32 2, 0, 0, implicit $mode, implicit $m0, implicit $exec + ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: S_ENDPGM 0 + $m0 = S_MOV_B32 killed renamable $sgpr0 + SCHED_BARRIER 0 + renamable $vgpr0 = V_INTERP_MOV_F32 2, 0, 0, implicit $mode, implicit $m0, implicit $exec + renamable $sgpr1 = S_MOV_B32 0 + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define amdgpu_kernel void @test_sched_barrier() #0 { +; GCN-LABEL: test_sched_barrier: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: ; sched_barrier mask(0x00000000) +; GCN-NEXT: ; sched_barrier mask(0x00000001) +; GCN-NEXT: ; sched_barrier mask(0x00000004) +; GCN-NEXT: ; sched_barrier mask(0x0000000F) +; GCN-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.sched.barrier(i32 0) #1 + call void @llvm.amdgcn.sched.barrier(i32 1) #1 + call void @llvm.amdgcn.sched.barrier(i32 4) #1 + call void @llvm.amdgcn.sched.barrier(i32 15) #1 + ret void +} + +declare void @llvm.amdgcn.sched.barrier(i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sched_barrier.mir b/llvm/test/CodeGen/AMDGPU/sched_barrier.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sched_barrier.mir @@ -0,0 +1,99 @@ +# NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... + +--- +name: no_sched_barrier +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: no_sched_barrier + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, 
addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_NOP 0 + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... + +--- +name: sched_barrier_0 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_0 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: SCHED_BARRIER 0 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) 
+ %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_NOP 0 + SCHED_BARRIER 0 + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... + +--- +name: sched_barrier_1 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_1 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: SCHED_BARRIER 1 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, 
addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_NOP 0 + SCHED_BARRIER 1 + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +...